src/vm/utf8.c

   1 /* src/vm/utf8.c - utf8 string functions
   2
   3    Copyright (C) 1996-2005, 2006 R. Grafl, A. Krall, C. Kruegel,
   4    C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
   5    E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
   6    J. Wenninger, Institut f. Computersprachen - TU Wien
   7
   8    This file is part of CACAO.
   9
  10    This program is free software; you can redistribute it and/or
  11    modify it under the terms of the GNU General Public License as
  12    published by the Free Software Foundation; either version 2, or (at
  13    your option) any later version.
  14
  15    This program is distributed in the hope that it will be useful, but
  16    WITHOUT ANY WARRANTY; without even the implied warranty of
  17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18    General Public License for more details.
  19
  20    You should have received a copy of the GNU General Public License
  21    along with this program; if not, write to the Free Software
  22    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  23    02110-1301, USA.
  24
  25    Contact: cacao@cacaojvm.org
  26
  27    Authors: Reinhard Grafl
  28
  29    Changes: Mark Probst
  30             Andreas Krall
  31             Christian Thalinger
  32                         Edwin Steiner
  33
  34    $Id: utf8.c 5821 2006-10-24 16:41:54Z edwin $
  35
  36 */
  37
  38
  39 #include "config.h"
  40
  41 #include <string.h>
  42 #include <assert.h>
  43
  44 #include "vm/types.h"
  45
  46 #include "mm/memory.h"
  47
  48 #if defined(ENABLE_THREADS)
  49 # include "threads/native/lock.h"
  50 #else
  51 # include "threads/none/lock.h"
  52 #endif
  53
  54 #include "vm/builtin.h"
  55 #include "vm/exceptions.h"
  56 #include "vm/hashtable.h"
  57 #include "vm/options.h"
  58 #include "vm/statistics.h"
  59 #include "vm/stringlocal.h"
  60 #include "vm/utf8.h"
  61
  62
  63 /* global variables ***********************************************************/
  64
  65 /* hashsize must be power of 2 */
  66
  67 #define HASHTABLE_UTF_SIZE    16384     /* initial size of utf-hash           */
  68
  69 hashtable *hashtable_utf;               /* hashtable for utf8-symbols         */
  70
  71
  72 /* utf-symbols for pointer comparison of frequently used strings **************/
  73
  74 utf *utf_java_lang_Object;
  75
  76 utf *utf_java_lang_Class;
  77 utf *utf_java_lang_ClassLoader;
  78 utf *utf_java_lang_Cloneable;
  79 utf *utf_java_lang_SecurityManager;
  80 utf *utf_java_lang_String;
  81 utf *utf_java_lang_System;
  82 utf *utf_java_lang_ThreadGroup;
  83 utf *utf_java_io_Serializable;
  84
  85 utf *utf_java_lang_Throwable;
  86 utf *utf_java_lang_VMThrowable;
  87 utf *utf_java_lang_Error;
  88 utf *utf_java_lang_AbstractMethodError;
  89 utf *utf_java_lang_LinkageError;
  90 utf *utf_java_lang_NoClassDefFoundError;
  91 utf *utf_java_lang_NoSuchMethodError;
  92 utf *utf_java_lang_OutOfMemoryError;
  93
  94 utf *utf_java_lang_Exception;
  95 utf *utf_java_lang_ClassCastException;
  96 utf *utf_java_lang_ClassNotFoundException;
  97 utf *utf_java_lang_IllegalArgumentException;
  98 utf *utf_java_lang_IllegalMonitorStateException;
  99
 100 utf *utf_java_lang_NullPointerException;
 101
 102 utf* utf_java_lang_Void;
 103 utf* utf_java_lang_Boolean;
 104 utf* utf_java_lang_Byte;
 105 utf* utf_java_lang_Character;
 106 utf* utf_java_lang_Short;
 107 utf* utf_java_lang_Integer;
 108 utf* utf_java_lang_Long;
 109 utf* utf_java_lang_Float;
 110 utf* utf_java_lang_Double;
 111
 112 utf *utf_java_lang_StackTraceElement;
 113 utf *utf_java_lang_reflect_Constructor;
 114 utf *utf_java_lang_reflect_Field;
 115 utf *utf_java_lang_reflect_Method;
 116 utf *utf_java_util_Vector;
 117
 118 utf *utf_InnerClasses;                  /* InnerClasses                       */
 119 utf *utf_ConstantValue;                 /* ConstantValue                      */
 120 utf *utf_Code;                          /* Code                               */
 121 utf *utf_Exceptions;                    /* Exceptions                         */
 122 utf *utf_LineNumberTable;               /* LineNumberTable                    */
 123 utf *utf_SourceFile;                    /* SourceFile                         */
 124
 125 utf *utf_init;                          /* <init>                             */
 126 utf *utf_clinit;                        /* <clinit>                           */
 127 utf *utf_clone;                         /* clone                              */
 128 utf *utf_finalize;                      /* finalize                           */
 129 utf *utf_run;                           /* run                                */
 130
 131 utf *utf_add;
 132 utf *utf_remove;
 133 utf *utf_removeThread;
 134 utf *utf_put;
 135 utf *utf_get;
 136 utf *utf_value;
 137
 138 utf *utf_fillInStackTrace;
 139 utf *utf_getSystemClassLoader;
 140 utf *utf_loadClass;
 141 utf *utf_printStackTrace;
 142
 143 utf *utf_Z;                             /* Z                                  */
 144 utf *utf_B;                             /* B                                  */
 145 utf *utf_C;                             /* C                                  */
 146 utf *utf_S;                             /* S                                  */
 147 utf *utf_I;                             /* I                                  */
 148 utf *utf_J;                             /* J                                  */
 149 utf *utf_F;                             /* F                                  */
 150 utf *utf_D;                             /* D                                  */
 151
 152 utf *utf_void__void;                    /* ()V                                */
 153 utf *utf_boolean__void;                 /* (Z)V                               */
 154 utf *utf_byte__void;                    /* (B)V                               */
 155 utf *utf_char__void;                    /* (C)V                               */
 156 utf *utf_short__void;                   /* (S)V                               */
 157 utf *utf_int__void;                     /* (I)V                               */
 158 utf *utf_long__void;                    /* (J)V                               */
 159 utf *utf_float__void;                   /* (F)V                               */
 160 utf *utf_double__void;                  /* (D)V                               */
 161
 162 utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
 163 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
 164 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
 165 utf *utf_java_lang_Object__java_lang_Object;
 166 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
 167 utf *utf_java_lang_String__java_lang_Class;
 168 utf *utf_java_lang_Thread__V;           /* (Ljava/lang/Thread;)V              */
 169 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
 170
 171 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
 172 utf *utf_null;
 173 utf *array_packagename;
 174
 175
 176 /* utf_init ********************************************************************
 177
 178    Initializes the utf8 subsystem.
 179
 180 *******************************************************************************/
 181
 182 bool utf8_init(void)
 183 {
 184         /* create utf8 hashtable */
 185
 186         hashtable_utf = NEW(hashtable);
 187
 188         hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
 189
 190 #if defined(ENABLE_STATISTICS)
 191         if (opt_stat)
 192                 count_utf_len += sizeof(utf*) * hashtable_utf->size;
 193 #endif
 194
 195         /* create utf-symbols for pointer comparison of frequently used strings */
 196
 197         utf_java_lang_Object           = utf_new_char("java/lang/Object");
 198
 199         utf_java_lang_Class            = utf_new_char("java/lang/Class");
 200         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
 201         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
 202         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
 203         utf_java_lang_String           = utf_new_char("java/lang/String");
 204         utf_java_lang_System           = utf_new_char("java/lang/System");
 205         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
 206         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
 207
 208         utf_java_lang_Throwable        = utf_new_char(string_java_lang_Throwable);
 209         utf_java_lang_VMThrowable      = utf_new_char(string_java_lang_VMThrowable);
 210         utf_java_lang_Error            = utf_new_char(string_java_lang_Error);
 211
 212         utf_java_lang_AbstractMethodError =
 213                 utf_new_char(string_java_lang_AbstractMethodError);
 214
 215         utf_java_lang_LinkageError =
 216                 utf_new_char(string_java_lang_LinkageError);
 217
 218         utf_java_lang_NoClassDefFoundError =
 219                 utf_new_char(string_java_lang_NoClassDefFoundError);
 220
 221         utf_java_lang_NoSuchMethodError =
 222                 utf_new_char(string_java_lang_NoSuchMethodError);
 223
 224         utf_java_lang_OutOfMemoryError =
 225                 utf_new_char(string_java_lang_OutOfMemoryError);
 226
 227         utf_java_lang_Exception        = utf_new_char(string_java_lang_Exception);
 228
 229         utf_java_lang_ClassCastException =
 230                 utf_new_char(string_java_lang_ClassCastException);
 231
 232         utf_java_lang_ClassNotFoundException =
 233                 utf_new_char(string_java_lang_ClassNotFoundException);
 234
 235         utf_java_lang_IllegalArgumentException =
 236                 utf_new_char(string_java_lang_IllegalArgumentException);
 237
 238         utf_java_lang_IllegalMonitorStateException =
 239                 utf_new_char(string_java_lang_IllegalMonitorStateException);
 240
 241         utf_java_lang_NullPointerException =
 242                 utf_new_char(string_java_lang_NullPointerException);
 243
 244         utf_java_lang_Void             = utf_new_char("java/lang/Void");
 245         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
 246         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
 247         utf_java_lang_Character        = utf_new_char("java/lang/Character");
 248         utf_java_lang_Short            = utf_new_char("java/lang/Short");
 249         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
 250         utf_java_lang_Long             = utf_new_char("java/lang/Long");
 251         utf_java_lang_Float            = utf_new_char("java/lang/Float");
 252         utf_java_lang_Double           = utf_new_char("java/lang/Double");
 253
 254         utf_java_lang_StackTraceElement =
 255                 utf_new_char("java/lang/StackTraceElement");
 256
 257         utf_java_lang_reflect_Constructor =
 258                 utf_new_char("java/lang/reflect/Constructor");
 259
 260         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
 261         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
 262         utf_java_util_Vector           = utf_new_char("java/util/Vector");
 263
 264         utf_InnerClasses               = utf_new_char("InnerClasses");
 265         utf_ConstantValue              = utf_new_char("ConstantValue");
 266         utf_Code                       = utf_new_char("Code");
 267         utf_Exceptions                 = utf_new_char("Exceptions");
 268         utf_LineNumberTable            = utf_new_char("LineNumberTable");
 269         utf_SourceFile                 = utf_new_char("SourceFile");
 270
 271         utf_init                           = utf_new_char("<init>");
 272         utf_clinit                         = utf_new_char("<clinit>");
 273         utf_clone                      = utf_new_char("clone");
 274         utf_finalize                   = utf_new_char("finalize");
 275         utf_run                        = utf_new_char("run");
 276
 277         utf_add                        = utf_new_char("add");
 278         utf_remove                     = utf_new_char("remove");
 279         utf_removeThread               = utf_new_char("removeThread");
 280         utf_put                        = utf_new_char("put");
 281         utf_get                        = utf_new_char("get");
 282         utf_value                      = utf_new_char("value");
 283
 284         utf_printStackTrace            = utf_new_char("printStackTrace");
 285         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
 286         utf_loadClass                  = utf_new_char("loadClass");
 287         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
 288
 289         utf_Z                          = utf_new_char("Z");
 290         utf_B                          = utf_new_char("B");
 291         utf_C                          = utf_new_char("C");
 292         utf_S                          = utf_new_char("S");
 293         utf_I                          = utf_new_char("I");
 294         utf_J                          = utf_new_char("J");
 295         utf_F                          = utf_new_char("F");
 296         utf_D                          = utf_new_char("D");
 297
 298         utf_void__void                 = utf_new_char("()V");
 299         utf_boolean__void              = utf_new_char("(Z)V");
 300         utf_byte__void                 = utf_new_char("(B)V");
 301         utf_char__void                 = utf_new_char("(C)V");
 302         utf_short__void                = utf_new_char("(S)V");
 303         utf_int__void                  = utf_new_char("(I)V");
 304         utf_long__void                 = utf_new_char("(J)V");
 305         utf_float__void                = utf_new_char("(F)V");
 306         utf_double__void               = utf_new_char("(D)V");
 307         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
 308         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
 309
 310         utf_void__java_lang_ClassLoader =
 311                 utf_new_char("()Ljava/lang/ClassLoader;");
 312
 313         utf_java_lang_Object__java_lang_Object =
 314                 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
 315
 316         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
 317
 318         utf_java_lang_String__java_lang_Class =
 319                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
 320
 321         utf_java_lang_Thread__V        = utf_new_char("(Ljava/lang/Thread;)V");
 322         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
 323
 324         utf_null                       = utf_new_char("null");
 325         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
 326         array_packagename              = utf_new_char("\t<the array package>");
 327
 328         /* everything's ok */
 329
 330         return true;
 331 }
 332
 333
 334 /* utf_hashkey *****************************************************************
 335
 336    The hashkey is computed from the utf-text by using up to 8
 337    characters.  For utf-symbols longer than 15 characters 3 characters
 338    are taken from the beginning and the end, 2 characters are taken
 339    from the middle.
 340
 341 *******************************************************************************/
 342
 343 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
 344 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
 345
 346 u4 utf_hashkey(const char *text, u4 length)
 347 {
 348         const char *start_pos = text;       /* pointer to utf text                */
 349         u4 a;
 350
 351         switch (length) {
 352         case 0: /* empty string */
 353                 return 0;
 354
 355         case 1: return fbs(0);
 356         case 2: return fbs(0) ^ nbs(3);
 357         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
 358         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
 359         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
 360         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
 361         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
 362         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
 363
 364         case 9:
 365                 a = fbs(0);
 366                 a ^= nbs(1);
 367                 a ^= nbs(2);
 368                 text++;
 369                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
 370
 371         case 10:
 372                 a = fbs(0);
 373                 text++;
 374                 a ^= nbs(2);
 375                 a ^= nbs(3);
 376                 a ^= nbs(4);
 377                 text++;
 378                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
 379
 380         case 11:
 381                 a = fbs(0);
 382                 text++;
 383                 a ^= nbs(2);
 384                 a ^= nbs(3);
 385                 a ^= nbs(4);
 386                 text++;
 387                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
 388
 389         case 12:
 390                 a = fbs(0);
 391                 text += 2;
 392                 a ^= nbs(2);
 393                 a ^= nbs(3);
 394                 text++;
 395                 a ^= nbs(5);
 396                 a ^= nbs(6);
 397                 a ^= nbs(7);
 398                 text++;
 399                 return a ^ nbs(9) ^ nbs(10);
 400
 401         case 13:
 402                 a = fbs(0);
 403                 a ^= nbs(1);
 404                 text++;
 405                 a ^= nbs(3);
 406                 a ^= nbs(4);
 407                 text += 2;
 408                 a ^= nbs(7);
 409                 a ^= nbs(8);
 410                 text += 2;
 411                 return a ^ nbs(9) ^ nbs(10);
 412
 413         case 14:
 414                 a = fbs(0);
 415                 text += 2;
 416                 a ^= nbs(3);
 417                 a ^= nbs(4);
 418                 text += 2;
 419                 a ^= nbs(7);
 420                 a ^= nbs(8);
 421                 text += 2;
 422                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 423
 424         case 15:
 425                 a = fbs(0);
 426                 text += 2;
 427                 a ^= nbs(3);
 428                 a ^= nbs(4);
 429                 text += 2;
 430                 a ^= nbs(7);
 431                 a ^= nbs(8);
 432                 text += 2;
 433                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 434
 435         default:  /* 3 characters from beginning */
 436                 a = fbs(0);
 437                 text += 2;
 438                 a ^= nbs(3);
 439                 a ^= nbs(4);
 440
 441                 /* 2 characters from middle */
 442                 text = start_pos + (length / 2);
 443                 a ^= fbs(5);
 444                 text += 2;
 445                 a ^= nbs(6);
 446
 447                 /* 3 characters from end */
 448                 text = start_pos + length - 4;
 449
 450                 a ^= fbs(7);
 451                 text++;
 452
 453                 return a ^ nbs(10) ^ nbs(11);
 454     }
 455 }
 456
 457 /* utf_full_hashkey ************************************************************
 458
 459    This function computes a hash value using all bytes in the string.
 460
 461    The algorithm is the "One-at-a-time" algorithm as published
 462    by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
 463
 464 *******************************************************************************/
 465
 466 u4 utf_full_hashkey(const char *text, u4 length)
 467 {
 468         register const unsigned char *p = (const unsigned char *) text;
 469         register u4 hash;
 470         register u4 i;
 471
 472         hash = 0;
 473         for (i=length; i--;)
 474         {
 475             hash += *p++;
 476             hash += (hash << 10);
 477             hash ^= (hash >> 6);
 478         }
 479         hash += (hash << 3);
 480         hash ^= (hash >> 11);
 481         hash += (hash << 15);
 482
 483         return hash;
 484 }
 485
 486 /* unicode_hashkey *************************************************************
 487
 488    Compute the hashkey of a unicode string.
 489
 490 *******************************************************************************/
 491
 492 u4 unicode_hashkey(u2 *text, u2 len)
 493 {
 494         return utf_hashkey((char *) text, len);
 495 }
 496
 497
 498 /* utf_new *********************************************************************
 499
 500    Creates a new utf-symbol, the text of the symbol is passed as a
 501    u1-array. The function searches the utf-hashtable for a utf-symbol
 502    with this text. On success the element returned, otherwise a new
 503    hashtable element is created.
 504
 505    If the number of entries in the hashtable exceeds twice the size of
 506    the hashtable slots a reorganization of the hashtable is done and
 507    the utf symbols are copied to a new hashtable with doubled size.
 508
 509 *******************************************************************************/
 510
 511 utf *utf_new(const char *text, u2 length)
 512 {
 513         u4 key;                             /* hashkey computed from utf-text     */
 514         u4 slot;                            /* slot in hashtable                  */
 515         utf *u;                             /* hashtable element                  */
 516         u2 i;
 517
 518         LOCK_MONITOR_ENTER(hashtable_utf->header);
 519
 520 #if defined(ENABLE_STATISTICS)
 521         if (opt_stat)
 522                 count_utf_new++;
 523 #endif
 524
 525         key  = utf_hashkey(text, length);
 526         slot = key & (hashtable_utf->size - 1);
 527         u    = hashtable_utf->ptr[slot];
 528
 529         /* search external hash chain for utf-symbol */
 530
 531         while (u) {
 532                 if (u->blength == length) {
 533                         /* compare text of hashtable elements */
 534
 535                         for (i = 0; i < length; i++)
 536                                 if (text[i] != u->text[i])
 537                                         goto nomatch;
 538
 539 #if defined(ENABLE_STATISTICS)
 540                         if (opt_stat)
 541                                 count_utf_new_found++;
 542 #endif
 543
 544                         /* symbol found in hashtable */
 545
 546                         LOCK_MONITOR_EXIT(hashtable_utf->header);
 547
 548                         return u;
 549                 }
 550
 551         nomatch:
 552                 u = u->hashlink; /* next element in external chain */
 553         }
 554
 555 #if defined(ENABLE_STATISTICS)
 556         if (opt_stat)
 557                 count_utf_len += sizeof(utf) + length + 1;
 558 #endif
 559
 560         /* location in hashtable found, create new utf element */
 561         u = NEW(utf);
 562         u->blength  = length;               /* length in bytes of utfstring       */
 563         u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain     */
 564         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
 565
 566         memcpy(u->text, text, length);      /* copy utf-text                      */
 567         u->text[length] = '\0';
 568
 569         hashtable_utf->ptr[slot] = u;       /* insert symbol into table           */
 570         hashtable_utf->entries++;           /* update number of entries           */
 571
 572         if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
 573
 574         /* reorganization of hashtable, average length of the external
 575            chains is approx. 2 */
 576
 577                 hashtable *newhash;                              /* the new hashtable */
 578                 u4         i;
 579                 utf       *u;
 580                 utf       *nextu;
 581                 u4         slot;
 582
 583                 /* create new hashtable, double the size */
 584
 585                 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
 586
 587 #if defined(ENABLE_STATISTICS)
 588                 if (opt_stat)
 589                         count_utf_len += sizeof(utf*) * hashtable_utf->size;
 590 #endif
 591
 592                 /* transfer elements to new hashtable */
 593
 594                 for (i = 0; i < hashtable_utf->size; i++) {
 595                         u = hashtable_utf->ptr[i];
 596
 597                         while (u) {
 598                                 nextu = u->hashlink;
 599                                 slot  = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
 600
 601                                 u->hashlink = (utf *) newhash->ptr[slot];
 602                                 newhash->ptr[slot] = u;
 603
 604                                 /* follow link in external hash chain */
 605
 606                                 u = nextu;
 607                         }
 608                 }
 609
 610                 /* dispose old table */
 611
 612                 hashtable_free(hashtable_utf);
 613
 614                 hashtable_utf = newhash;
 615         }
 616
 617         LOCK_MONITOR_EXIT(hashtable_utf->header);
 618
 619         return u;
 620 }
 621
 622
 623 /* utf_new_u2 ******************************************************************
 624
 625    Make utf symbol from u2 array, if isclassname is true '.' is
 626    replaced by '/'.
 627
 628 *******************************************************************************/
 629
 630 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
 631 {
 632         char *buffer;                   /* memory buffer for  unicode characters  */
 633         char *pos;                      /* pointer to current position in buffer  */
 634         u4 left;                        /* unicode characters left                */
 635         u4 buflength;                   /* utf length in bytes of the u2 array    */
 636         utf *result;                    /* resulting utf-string                   */
 637         int i;
 638
 639         /* determine utf length in bytes and allocate memory */
 640
 641         buflength = u2_utflength(unicode_pos, unicode_length);
 642         buffer    = MNEW(char, buflength);
 643
 644         left = buflength;
 645         pos  = buffer;
 646
 647         for (i = 0; i++ < unicode_length; unicode_pos++) {
 648                 /* next unicode character */
 649                 u2 c = *unicode_pos;
 650
 651                 if ((c != 0) && (c < 0x80)) {
 652                         /* 1 character */
 653                         left--;
 654                 if ((int) left < 0) break;
 655                         /* convert classname */
 656                         if (isclassname && c == '.')
 657                                 *pos++ = '/';
 658                         else
 659                                 *pos++ = (char) c;
 660
 661                 } else if (c < 0x800) {
 662                         /* 2 characters */
 663                 unsigned char high = c >> 6;
 664                 unsigned char low  = c & 0x3F;
 665                         left = left - 2;
 666                 if ((int) left < 0) break;
 667                 *pos++ = high | 0xC0;
 668                 *pos++ = low  | 0x80;
 669
 670                 } else {
 671                 /* 3 characters */
 672                 char low  = c & 0x3f;
 673                 char mid  = (c >> 6) & 0x3F;
 674                 char high = c >> 12;
 675                         left = left - 3;
 676                 if ((int) left < 0) break;
 677                 *pos++ = high | 0xE0;
 678                 *pos++ = mid  | 0x80;
 679                 *pos++ = low  | 0x80;
 680                 }
 681         }
 682
 683         /* insert utf-string into symbol-table */
 684         result = utf_new(buffer,buflength);
 685
 686         MFREE(buffer, char, buflength);
 687
 688         return result;
 689 }
 690
 691
 692 /* utf_new_char ****************************************************************
 693
 694    Creates a new utf symbol, the text for this symbol is passed as a
 695    c-string ( = char* ).
 696
 697 *******************************************************************************/
 698
 699 utf *utf_new_char(const char *text)
 700 {
 701         return utf_new(text, strlen(text));
 702 }
 703
 704
 705 /* utf_new_char_classname ******************************************************
 706
 707    Creates a new utf symbol, the text for this symbol is passed as a
 708    c-string ( = char* ) "." characters are going to be replaced by
 709    "/". Since the above function is used often, this is a separte
 710    function, instead of an if.
 711
 712 *******************************************************************************/
 713
 714 utf *utf_new_char_classname(const char *text)
 715 {
 716         if (strchr(text, '.')) {
 717                 char *txt = strdup(text);
 718                 char *end = txt + strlen(txt);
 719                 char *c;
 720                 utf *tmpRes;
 721
 722                 for (c = txt; c < end; c++)
 723                         if (*c == '.') *c = '/';
 724
 725                 tmpRes = utf_new(txt, strlen(txt));
 726                 FREE(txt, 0);
 727
 728                 return tmpRes;
 729
 730         } else
 731                 return utf_new(text, strlen(text));
 732 }
 733
 734
 735 /* utf_nextu2 ******************************************************************
 736
 737    Read the next unicode character from the utf string and increment
 738    the utf-string pointer accordingly.
 739
 740    CAUTION: This function is unsafe for input that was not checked
 741             by is_valid_utf!
 742
 743 *******************************************************************************/
 744
 745 u2 utf_nextu2(char **utf_ptr)
 746 {
 747     /* uncompressed unicode character */
 748     u2 unicode_char = 0;
 749     /* current position in utf text */
 750     unsigned char *utf = (unsigned char *) (*utf_ptr);
 751     /* bytes representing the unicode character */
 752     unsigned char ch1, ch2, ch3;
 753     /* number of bytes used to represent the unicode character */
 754     int len = 0;
 755
 756     switch ((ch1 = utf[0]) >> 4) {
 757         default: /* 1 byte */
 758                 (*utf_ptr)++;
 759                 return (u2) ch1;
 760         case 0xC:
 761         case 0xD: /* 2 bytes */
 762                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 763                         unsigned char high = ch1 & 0x1F;
 764                         unsigned char low  = ch2 & 0x3F;
 765                         unicode_char = (high << 6) + low;
 766                         len = 2;
 767                 }
 768                 break;
 769
 770         case 0xE: /* 2 or 3 bytes */
 771                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 772                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
 773                                 unsigned char low  = ch3 & 0x3f;
 774                                 unsigned char mid  = ch2 & 0x3f;
 775                                 unsigned char high = ch1 & 0x0f;
 776                                 unicode_char = (((high << 6) + mid) << 6) + low;
 777                                 len = 3;
 778                         } else
 779                                 len = 2;
 780                 }
 781                 break;
 782     }
 783
 784     /* update position in utf-text */
 785     *utf_ptr = (char *) (utf + len);
 786
 787     return unicode_char;
 788 }
 789
 790
 791 /* utf_bytes *******************************************************************
 792
 793    Determine number of bytes (aka. octets) in the utf string.
 794
 795    IN:
 796       u............utf string
 797
 798    OUT:
 799       The number of octets of this utf string.
 800           There is _no_ terminating zero included in this count.
 801
 802 *******************************************************************************/
 803
 804 u4 utf_bytes(utf *u)
 805 {
 806         return u->blength;
 807 }
 808
 809 /* utf_get_number_of_u2s_for_buffer ********************************************
 810
 811    Determine number of UTF-16 u2s in the given UTF-8 buffer
 812
 813    CAUTION: This function is unsafe for input that was not checked
 814             by is_valid_utf!
 815
 816    CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
 817    to an array of u2s (UTF-16) and want to know how many of them you will get.
 818    All other uses of this function are probably wrong.
 819
 820    IN:
 821       buffer........points to first char in buffer
 822           blength.......number of _bytes_ in the buffer
 823
 824    OUT:
 825       the number of u2s needed to hold this string in UTF-16 encoding.
 826           There is _no_ terminating zero included in this count.
 827
 828    NOTE: Unlike utf_get_number_of_u2s, this function never throws an
 829    exception.
 830
 831 *******************************************************************************/
 832
 833 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
 834 {
 835         const char *endpos;                 /* points behind utf string           */
 836         const char *utf_ptr;                /* current position in utf text       */
 837         u4 len = 0;                         /* number of unicode characters       */
 838
 839         utf_ptr = buffer;
 840         endpos = utf_ptr + blength;
 841
 842         while (utf_ptr < endpos) {
 843                 len++;
 844                 /* next unicode character */
 845                 utf_nextu2((char **)&utf_ptr);
 846         }
 847
 848         assert(utf_ptr == endpos);
 849
 850         return len;
 851 }
 852
 853
 854 /* utf_get_number_of_u2s *******************************************************
 855
 856    Determine number of UTF-16 u2s in the utf string.
 857
 858    CAUTION: This function is unsafe for input that was not checked
 859             by is_valid_utf!
 860
 861    CAUTION: Use this function *only* when you want to convert a utf string
 862    to an array of u2s and want to know how many of them you will get.
 863    All other uses of this function are probably wrong.
 864
 865    IN:
 866       u............utf string
 867
 868    OUT:
 869       the number of u2s needed to hold this string in UTF-16 encoding.
 870           There is _no_ terminating zero included in this count.
 871           XXX 0 if a NullPointerException has been thrown (see below)
 872
 873 *******************************************************************************/
 874
 875 u4 utf_get_number_of_u2s(utf *u)
 876 {
 877         char *endpos;                       /* points behind utf string           */
 878         char *utf_ptr;                      /* current position in utf text       */
 879         u4 len = 0;                         /* number of unicode characters       */
 880
 881         /* XXX this is probably not checked by most callers! Review this after */
 882         /* the invalid uses of this function have been eliminated */
 883         if (!u) {
 884                 exceptions_throw_nullpointerexception();
 885                 return 0;
 886         }
 887
 888         endpos = UTF_END(u);
 889         utf_ptr = u->text;
 890
 891         while (utf_ptr < endpos) {
 892                 len++;
 893                 /* next unicode character */
 894                 utf_nextu2(&utf_ptr);
 895         }
 896
 897         if (utf_ptr != endpos)
 898                 /* string ended abruptly */
 899                 throw_cacao_exception_exit(string_java_lang_InternalError,
 900                                                                    "Illegal utf8 string");
 901
 902         return len;
 903 }
 904
 905
 906 /* utf8_safe_number_of_u2s *****************************************************
 907
 908    Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
 909    (For invalid UTF-8 the U+fffd replacement character will be counted.)
 910
 911    This function is safe even for invalid UTF-8 strings.
 912
 913    IN:
 914       text..........zero-terminated UTF-8 string (may be invalid)
 915                         must NOT be NULL
 916
 917    OUT:
 918       the number of u2s needed to hold this string in UTF-16 encoding.
 919           There is _no_ terminating zero included in this count.
 920
 921 *******************************************************************************/
 922
 923 s4 utf8_safe_number_of_u2s(const char *text) {
 924         register const unsigned char *t;
 925         register s4 byte;
 926         register s4 len;
 927         s4 byte1;
 928         s4 byte2;
 929         s4 byte3;
 930         s4 value;
 931         s4 skip;
 932
 933         assert(text);
 934
 935         len = 0;
 936         t = (const unsigned char *) text;
 937
 938         /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
 939
 940         while (1) {
 941                 byte = *t++;
 942
 943                 if (byte & 0x80) {
 944                         /* highest bit set, non-ASCII character */
 945
 946                         if ((byte & 0xe0) == 0xc0) {
 947                                 /* 2-byte: should be 110..... 10...... ? */
 948
 949                                 if ((*t++ & 0xc0) == 0x80)
 950                                         ; /* valid 2-byte */
 951                                 else
 952                                         t--; /* invalid */
 953                         }
 954                         else if ((byte & 0xf0) == 0xe0) {
 955                                 /* 3-byte: should be 1110.... 10...... 10...... */
 956
 957                                 if ((*t++ & 0xc0) == 0x80) {
 958                                         if ((*t++ & 0xc0) == 0x80)
 959                                                 ; /* valid 3-byte */
 960                                         else
 961                                                 t--; /* invalid */
 962                                 }
 963                                 else
 964                                         t--; /* invalid */
 965                         }
 966                         else if ((byte & 0xf8) == 0xf0) {
 967                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
 968
 969                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
 970                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
 971                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
 972                                                         /* valid 4-byte UTF-8? */
 973                                                         value = ((byte  & 0x07) << 18)
 974                                                                   | ((byte1 & 0x3f) << 12)
 975                                                                   | ((byte2 & 0x3f) <<  6)
 976                                                                   | ((byte3 & 0x3f)      );
 977
 978                                                         if (value > 0x10FFFF)
 979                                                                 ; /* invalid */
 980                                                         else if (value > 0xFFFF)
 981                                                                 len += 1; /* we need surrogates */
 982                                                         else
 983                                                                 ; /* 16bit suffice */
 984                                                 }
 985                                                 else
 986                                                         t--; /* invalid */
 987                                         }
 988                                         else
 989                                                 t--; /* invalid */
 990                                 }
 991                                 else
 992                                         t--; /* invalid */
 993                         }
 994                         else if ((byte & 0xfc) == 0xf8) {
 995                                 /* invalid 5-byte */
 996                                 skip = 4;
 997                                 for (; skip && (*t & 0x80); --skip)
 998                                         t++;
 999                         }
1000                         else if ((byte & 0xfe) == 0xfc) {
1001                                 /* invalid 6-byte */
1002                                 skip = 5;
1003                                 for (; skip && (*t & 0x80); --skip)
1004                                         t++;
1005                         }
1006                         else
1007                                 ; /* invalid */
1008                 }
1009                 else {
1010                         /* NUL */
1011
1012                         if (byte == 0)
1013                                 break;
1014
1015                         /* ASCII character, common case */
1016                 }
1017
1018                 len++;
1019         }
1020
1021         return len;
1022 }
1023
1024
1025 /* utf8_safe_convert_to_u2s ****************************************************
1026
1027    Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
1028    (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
1029    Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
1030
1031    This function is safe even for invalid UTF-8 strings.
1032
1033    IN:
1034       text..........zero-terminated UTF-8 string (may be invalid)
1035                         must NOT be NULL
1036
1037 *******************************************************************************/
1038
1039 #define UNICODE_REPLACEMENT  0xfffd
1040
1041 void utf8_safe_convert_to_u2s(const char *text, u2 *buffer) {
1042         register const unsigned char *t;
1043         register s4 byte;
1044         s4 byte1;
1045         s4 byte2;
1046         s4 byte3;
1047         s4 value;
1048         s4 skip;
1049
1050         assert(text);
1051
1052         t = (const unsigned char *) text;
1053
1054         /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
1055
1056         while (1) {
1057                 byte = *t++;
1058
1059                 if (byte & 0x80) {
1060                         /* highest bit set, non-ASCII character */
1061
1062                         if ((byte & 0xe0) == 0xc0) {
1063                                 /* 2-byte: should be 110..... 10...... */
1064
1065                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1066                                         /* valid 2-byte UTF-8 */
1067                                         *buffer++ = ((byte  & 0x1f) << 6)
1068                                                           | ((byte1 & 0x3f)     );
1069                                 }
1070                                 else {
1071                                         *buffer++ = UNICODE_REPLACEMENT;
1072                                         t--;
1073                                 }
1074                         }
1075                         else if ((byte & 0xf0) == 0xe0) {
1076                                 /* 3-byte: should be 1110.... 10...... 10...... */
1077
1078                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1079                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1080                                                 /* valid 3-byte UTF-8 */
1081                                                 *buffer++ = ((byte  & 0x0f) << 12)
1082                                                                   | ((byte1 & 0x3f) <<  6)
1083                                                                   | ((byte2 & 0x3f)      );
1084                                         }
1085                                         else {
1086                                                 *buffer++ = UNICODE_REPLACEMENT;
1087                                                 t--;
1088                                         }
1089                                 }
1090                                 else {
1091                                         *buffer++ = UNICODE_REPLACEMENT;
1092                                         t--;
1093                                 }
1094                         }
1095                         else if ((byte & 0xf8) == 0xf0) {
1096                                 /* 4-byte: should be 11110... 10...... 10...... 10...... */
1097
1098                                 if (((byte1 = *t++) & 0xc0) == 0x80) {
1099                                         if (((byte2 = *t++) & 0xc0) == 0x80) {
1100                                                 if (((byte3 = *t++) & 0xc0) == 0x80) {
1101                                                         /* valid 4-byte UTF-8? */
1102                                                         value = ((byte  & 0x07) << 18)
1103                                                                   | ((byte1 & 0x3f) << 12)
1104                                                                   | ((byte2 & 0x3f) <<  6)
1105                                                                   | ((byte3 & 0x3f)      );
1106
1107                                                         if (value > 0x10FFFF) {
1108                                                                 *buffer++ = UNICODE_REPLACEMENT;
1109                                                         }
1110                                                         else if (value > 0xFFFF) {
1111                                                                 /* we need surrogates */
1112                                                                 *buffer++ = 0xd800 | ((value >> 10) - 0x40);
1113                                                                 *buffer++ = 0xdc00 | (value & 0x03ff);
1114                                                         }
1115                                                         else
1116                                                                 *buffer++ = value; /* 16bit suffice */
1117                                                 }
1118                                                 else {
1119                                                         *buffer++ = UNICODE_REPLACEMENT;
1120                                                         t--;
1121                                                 }
1122                                         }
1123                                         else {
1124                                                 *buffer++ = UNICODE_REPLACEMENT;
1125                                                 t--;
1126                                         }
1127                                 }
1128                                 else {
1129                                         *buffer++ = UNICODE_REPLACEMENT;
1130                                         t--;
1131                                 }
1132                         }
1133                         else if ((byte & 0xfc) == 0xf8) {
1134                                 skip = 4;
1135                                 for (; skip && (*t & 0x80); --skip)
1136                                         t++;
1137                                 *buffer++ = UNICODE_REPLACEMENT;
1138                         }
1139                         else if ((byte & 0xfe) == 0xfc) {
1140                                 skip = 5;
1141                                 for (; skip && (*t & 0x80); --skip)
1142                                         t++;
1143                                 *buffer++ = UNICODE_REPLACEMENT;
1144                         }
1145                         else
1146                                 *buffer++ = UNICODE_REPLACEMENT;
1147                 }
1148                 else {
1149                         /* NUL */
1150
1151                         if (byte == 0)
1152                                 break;
1153
1154                         /* ASCII character, common case */
1155
1156                         *buffer++ = byte;
1157                 }
1158         }
1159 }
1160
1161
1162 /* u2_utflength ****************************************************************
1163
1164    Returns the utf length in bytes of a u2 array.
1165
1166 *******************************************************************************/
1167
1168 u4 u2_utflength(u2 *text, u4 u2_length)
1169 {
1170         u4 result_len = 0;                  /* utf length in bytes                */
1171         u2 ch;                              /* current unicode character          */
1172         u4 len;
1173
1174         for (len = 0; len < u2_length; len++) {
1175                 /* next unicode character */
1176                 ch = *text++;
1177
1178                 /* determine bytes required to store unicode character as utf */
1179                 if (ch && (ch < 0x80))
1180                         result_len++;
1181                 else if (ch < 0x800)
1182                         result_len += 2;
1183                 else
1184                         result_len += 3;
1185         }
1186
1187     return result_len;
1188 }
1189
1190
1191 /* utf_copy ********************************************************************
1192
1193    Copy the given utf string byte-for-byte to a buffer.
1194
1195    IN:
1196       buffer.......the buffer
1197           u............the utf string
1198
1199 *******************************************************************************/
1200
1201 void utf_copy(char *buffer, utf *u)
1202 {
1203         /* our utf strings are zero-terminated (done by utf_new) */
1204         MCOPY(buffer, u->text, char, u->blength + 1);
1205 }
1206
1207
1208 /* utf_cat *********************************************************************
1209
1210    Append the given utf string byte-for-byte to a buffer.
1211
1212    IN:
1213       buffer.......the buffer
1214           u............the utf string
1215
1216 *******************************************************************************/
1217
1218 void utf_cat(char *buffer, utf *u)
1219 {
1220         /* our utf strings are zero-terminated (done by utf_new) */
1221         MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
1222 }
1223
1224
1225 /* utf_copy_classname **********************************************************
1226
1227    Copy the given utf classname byte-for-byte to a buffer.
1228    '/' is replaced by '.'
1229
1230    IN:
1231       buffer.......the buffer
1232           u............the utf string
1233
1234 *******************************************************************************/
1235
1236 void utf_copy_classname(char *buffer, utf *u)
1237 {
1238         char *bufptr;
1239         char *srcptr;
1240         char *endptr;
1241         char ch;
1242
1243         bufptr = buffer;
1244         srcptr = u->text;
1245         endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */
1246
1247         while (srcptr != endptr) {
1248                 ch = *srcptr++;
1249                 if (ch == '/')
1250                         ch = '.';
1251                 *bufptr++ = ch;
1252         }
1253 }
1254
1255
1256 /* utf_cat *********************************************************************
1257
1258    Append the given utf classname byte-for-byte to a buffer.
1259    '/' is replaced by '.'
1260
1261    IN:
1262       buffer.......the buffer
1263           u............the utf string
1264
1265 *******************************************************************************/
1266
1267 void utf_cat_classname(char *buffer, utf *u)
1268 {
1269         utf_copy_classname(buffer + strlen(buffer), u);
1270 }
1271
1272 /* utf_display_printable_ascii *************************************************
1273
1274    Write utf symbol to stdout (for debugging purposes).
1275    Non-printable and non-ASCII characters are printed as '?'.
1276
1277 *******************************************************************************/
1278
1279 void utf_display_printable_ascii(utf *u)
1280 {
1281         char *endpos;                       /* points behind utf string           */
1282         char *utf_ptr;                      /* current position in utf text       */
1283
1284         if (u == NULL) {
1285                 printf("NULL");
1286                 fflush(stdout);
1287                 return;
1288         }
1289
1290         endpos = UTF_END(u);
1291         utf_ptr = u->text;
1292
1293         while (utf_ptr < endpos) {
1294                 /* read next unicode character */
1295
1296                 u2 c = utf_nextu2(&utf_ptr);
1297
1298                 if ((c >= 32) && (c <= 127))
1299                         printf("%c", c);
1300                 else
1301                         printf("?");
1302         }
1303
1304         fflush(stdout);
1305 }
1306
1307
1308 /* utf_display_printable_ascii_classname ***************************************
1309
1310    Write utf symbol to stdout with `/' converted to `.' (for debugging
1311    purposes).
1312    Non-printable and non-ASCII characters are printed as '?'.
1313
1314 *******************************************************************************/
1315
1316 void utf_display_printable_ascii_classname(utf *u)
1317 {
1318         char *endpos;                       /* points behind utf string           */
1319         char *utf_ptr;                      /* current position in utf text       */
1320
1321         if (u == NULL) {
1322                 printf("NULL");
1323                 fflush(stdout);
1324                 return;
1325         }
1326
1327         endpos = UTF_END(u);
1328         utf_ptr = u->text;
1329
1330         while (utf_ptr < endpos) {
1331                 /* read next unicode character */
1332
1333                 u2 c = utf_nextu2(&utf_ptr);
1334
1335                 if (c == '/')
1336                         c = '.';
1337
1338                 if ((c >= 32) && (c <= 127))
1339                         printf("%c", c);
1340                 else
1341                         printf("?");
1342         }
1343
1344         fflush(stdout);
1345 }
1346
1347
1348 /* utf_sprint_convert_to_latin1 ************************************************
1349
1350    Write utf symbol into c-string (for debugging purposes).
1351    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1352    invalid results.
1353
1354 *******************************************************************************/
1355
1356 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1357 {
1358         char *endpos;                       /* points behind utf string           */
1359         char *utf_ptr;                      /* current position in utf text       */
1360         u2 pos = 0;                         /* position in c-string               */
1361
1362         if (!u) {
1363                 strcpy(buffer, "NULL");
1364                 return;
1365         }
1366
1367         endpos = UTF_END(u);
1368         utf_ptr = u->text;
1369
1370         while (utf_ptr < endpos)
1371                 /* copy next unicode character */
1372                 buffer[pos++] = utf_nextu2(&utf_ptr);
1373
1374         /* terminate string */
1375         buffer[pos] = '\0';
1376 }
1377
1378
1379 /* utf_sprint_convert_to_latin1_classname **************************************
1380
1381    Write utf symbol into c-string with `/' converted to `.' (for debugging
1382    purposes).
1383    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1384    invalid results.
1385
1386 *******************************************************************************/
1387
1388 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1389 {
1390         char *endpos;                       /* points behind utf string           */
1391         char *utf_ptr;                      /* current position in utf text       */
1392         u2 pos = 0;                         /* position in c-string               */
1393
1394         if (!u) {
1395                 strcpy(buffer, "NULL");
1396                 return;
1397         }
1398
1399         endpos = UTF_END(u);
1400         utf_ptr = u->text;
1401
1402         while (utf_ptr < endpos) {
1403                 /* copy next unicode character */
1404                 u2 c = utf_nextu2(&utf_ptr);
1405                 if (c == '/') c = '.';
1406                 buffer[pos++] = c;
1407         }
1408
1409         /* terminate string */
1410         buffer[pos] = '\0';
1411 }
1412
1413
1414 /* utf_strcat_convert_to_latin1 ************************************************
1415
1416    Like libc strcat, but uses an utf8 string.
1417    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1418    invalid results.
1419
1420 *******************************************************************************/
1421
1422 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1423 {
1424         utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1425 }
1426
1427
1428 /* utf_strcat_convert_to_latin1_classname **************************************
1429
1430    Like libc strcat, but uses an utf8 string.
1431    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1432    invalid results.
1433
1434 *******************************************************************************/
1435
1436 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1437 {
1438         utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1439 }
1440
1441
1442 /* utf_fprint_printable_ascii **************************************************
1443
1444    Write utf symbol into file.
1445    Non-printable and non-ASCII characters are printed as '?'.
1446
1447 *******************************************************************************/
1448
1449 void utf_fprint_printable_ascii(FILE *file, utf *u)
1450 {
1451         char *endpos;                       /* points behind utf string           */
1452         char *utf_ptr;                      /* current position in utf text       */
1453
1454         if (!u)
1455                 return;
1456
1457         endpos = UTF_END(u);
1458         utf_ptr = u->text;
1459
1460         while (utf_ptr < endpos) {
1461                 /* read next unicode character */
1462                 u2 c = utf_nextu2(&utf_ptr);
1463
1464                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1465                 else fprintf(file, "?");
1466         }
1467 }
1468
1469
1470 /* utf_fprint_printable_ascii_classname ****************************************
1471
1472    Write utf symbol into file with `/' converted to `.'.
1473    Non-printable and non-ASCII characters are printed as '?'.
1474
1475 *******************************************************************************/
1476
1477 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1478 {
1479         char *endpos;                       /* points behind utf string           */
1480         char *utf_ptr;                      /* current position in utf text       */
1481
1482     if (!u)
1483                 return;
1484
1485         endpos = UTF_END(u);
1486         utf_ptr = u->text;
1487
1488         while (utf_ptr < endpos) {
1489                 /* read next unicode character */
1490                 u2 c = utf_nextu2(&utf_ptr);
1491                 if (c == '/') c = '.';
1492
1493                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1494                 else fprintf(file, "?");
1495         }
1496 }
1497
1498
/* is_valid_utf ****************************************************************

   Return true if the given string is a valid UTF-8 string in the sense
   accepted here:

      - a zero byte is not allowed anywhere
      - only 1-, 2- and 3-byte sequences are accepted (Java limitation)
      - every continuation byte must have the form 10xx xxxx
      - the value zero must be encoded with the 2-byte form
      - overlong encodings, surrogates and 0xfffe/0xffff are accepted

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

/*  static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26}; */

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
	int           remaining;            /* bytes not yet consumed             */
	int           ncont;                /* continuation bytes expected        */
	int           k;
	unsigned char lead;                 /* first byte of current sequence     */
	unsigned char cb;                   /* current continuation byte          */
	unsigned long value;                /* decoded value of the sequence      */

	if (end_pos < utf_ptr)
		return false;

	remaining = end_pos - utf_ptr;

	while (remaining > 0) {
		lead = (unsigned char) *utf_ptr++;
		remaining--;

		if (lead == 0x00)                   /* 0x00 is not allowed */
			return false;

		if (lead < 0x80)                    /* ASCII */
			continue;

		if ((lead & 0xe0) == 0xc0)          /* 110x xxxx */
			ncont = 1;
		else if ((lead & 0xf0) == 0xe0)     /* 1110 xxxx */
			ncont = 2;
		else
			/* 4- to 6-byte lead bytes (Java limitation), stray */
			/* continuation bytes and 0xfe/0xff                  */
			return false;

		if (remaining < ncont)              /* missing bytes */
			return false;

		remaining -= ncont;

		/* payload bits of the lead byte: 5 for 2-byte, 4 for 3-byte */

		value = lead & (0x3f >> ncont);

		for (k = 0; k < ncont; k++) {
			cb = (unsigned char) *utf_ptr++;

			if ((cb & 0xc0) != 0x80)        /* 10xx xxxx */
				return false;

			value = (value << 6) | (cb & 0x3f);
		}

		/* Java special: zero is only valid in its 2-byte form */

		if ((value == 0) && (ncont != 1))
			return false;

		/* Sun Java seems to allow overlong UTF-8 encodings, so there is */
		/* no check of value against min_codepoint.                      */

		/* surrogates in UTF-8 seem to be allowed in Java classfiles,    */
		/* and even 0xfffe and 0xffff seem to be allowed                 */
	}

	return true;
}
1564
1565
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. The string is rejected if it is empty, if it contains a byte
   below 0x20 (control character), or if it contains the byte pair
   0xc0 0x80 (the 2-byte encoding of zero).

   NOTE: The string is assumed to have passed is_valid_utf! The byte
   following a 0xc0 is read without checking it against end_pos.

   utf_ptr...points to first character
   end_pos...points after last character

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	const unsigned char *cur;           /* byte being checked                 */
	const unsigned char *stop;          /* behind the last byte               */

	/* disallow empty names */

	if (utf_ptr >= end_pos)
		return false;

	cur  = (const unsigned char *) utf_ptr;
	stop = (const unsigned char *) end_pos;

	for (; cur < stop; cur++) {
		/* disallow control characters */

		if (cur[0] < 0x20)
			return false;

		/* disallow zero */

		if ((cur[0] == 0xc0) && (cur[1] == 0x80))
			return false;
	}

	return true;
}
1593
1594 bool is_valid_name_utf(utf *u)
1595 {
1596         return is_valid_name(u->text, UTF_END(u));
1597 }
1598
1599
1600 /* utf_show ********************************************************************
1601
1602    Writes the utf symbols in the utfhash to stdout and displays the
1603    number of external hash chains grouped according to the chainlength
1604    (for debugging purposes).
1605
1606 *******************************************************************************/
1607
1608 #if !defined(NDEBUG)
1609 void utf_show(void)
1610 {
1611
1612 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1613
1614         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1615         u4 max_chainlength = 0;      /* maximum length of the chains */
1616         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1617         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1618         u4 i;
1619
1620         printf("UTF-HASH:\n");
1621
1622         /* show element of utf-hashtable */
1623
1624         for (i = 0; i < hashtable_utf->size; i++) {
1625                 utf *u = hashtable_utf->ptr[i];
1626
1627                 if (u) {
1628                         printf("SLOT %d: ", (int) i);
1629
1630                         while (u) {
1631                                 printf("'");
1632                                 utf_display_printable_ascii(u);
1633                                 printf("' ");
1634                                 u = u->hashlink;
1635                         }
1636                         printf("\n");
1637                 }
1638         }
1639
1640         printf("UTF-HASH: %d slots for %d entries\n",
1641                    (int) hashtable_utf->size, (int) hashtable_utf->entries );
1642
1643         if (hashtable_utf->entries == 0)
1644                 return;
1645
1646         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1647
1648         for (i=0;i<CHAIN_LIMIT;i++)
1649                 chain_count[i]=0;
1650
1651         /* count numbers of hashchains according to their length */
1652         for (i=0; i<hashtable_utf->size; i++) {
1653
1654                 utf *u = (utf*) hashtable_utf->ptr[i];
1655                 u4 chain_length = 0;
1656
1657                 /* determine chainlength */
1658                 while (u) {
1659                         u = u->hashlink;
1660                         chain_length++;
1661                 }
1662
1663                 /* update sum of all chainlengths */
1664                 sum_chainlength+=chain_length;
1665
1666                 /* determine the maximum length of the chains */
1667                 if (chain_length>max_chainlength)
1668                         max_chainlength = chain_length;
1669
1670                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1671                 if (chain_length>=CHAIN_LIMIT) {
1672                         beyond_limit+=chain_length;
1673                         chain_length=CHAIN_LIMIT-1;
1674                 }
1675
1676                 /* update number of hashchains of current length */
1677                 chain_count[chain_length]++;
1678         }
1679
1680         /* display results */
1681         for (i=1;i<CHAIN_LIMIT-1;i++)
1682                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1683
1684         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1685
1686
1687         printf("max. chainlength:%5d\n",max_chainlength);
1688
1689         /* avg. chainlength = sum of chainlengths / number of chains */
1690         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1691 }
1692 #endif /* !defined(NDEBUG) */
1693
1694
1695 /*
1696  * These are local overrides for various environment variables in Emacs.
1697  * Please do not remove this and leave it at the end of the file, where
1698  * Emacs will automagically detect them.
1699  * ---------------------------------------------------------------------
1700  * Local variables:
1701  * mode: c
1702  * indent-tabs-mode: t
1703  * c-basic-offset: 4
1704  * tab-width: 4
1705  * End:
1706  * vim:noexpandtab:sw=4:ts=4:
1707  */