src/vm/utf8.c

   1 /* src/vm/utf8.c - utf8 string functions
   2
   3    Copyright (C) 1996-2005, 2006 R. Grafl, A. Krall, C. Kruegel,
   4    C. Oates, R. Obermaisser, M. Platter, M. Probst, S. Ring,
   5    E. Steiner, C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich,
   6    J. Wenninger, Institut f. Computersprachen - TU Wien
   7
   8    This file is part of CACAO.
   9
  10    This program is free software; you can redistribute it and/or
  11    modify it under the terms of the GNU General Public License as
  12    published by the Free Software Foundation; either version 2, or (at
  13    your option) any later version.
  14
  15    This program is distributed in the hope that it will be useful, but
  16    WITHOUT ANY WARRANTY; without even the implied warranty of
  17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18    General Public License for more details.
  19
  20    You should have received a copy of the GNU General Public License
  21    along with this program; if not, write to the Free Software
  22    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  23    02110-1301, USA.
  24
  25    Contact: cacao@cacaojvm.org
  26
  27    Authors: Reinhard Grafl
  28
  29    Changes: Mark Probst
  30             Andreas Krall
  31             Christian Thalinger
  32                         Edwin Steiner
  33
  34    $Id: utf8.c 5053 2006-06-28 19:11:20Z twisti $
  35
  36 */
  37
  38
  39 #include "config.h"
  40
#include <assert.h>
#include <stdlib.h>
#include <string.h>
  43
  44 #include "vm/types.h"
  45
  46 #include "mm/memory.h"
  47
  48 #if defined(ENABLE_THREADS)
  49 # include "threads/native/threads.h"
  50 #endif
  51
  52 #include "vm/builtin.h"
  53 #include "vm/exceptions.h"
  54 #include "vm/hashtable.h"
  55 #include "vm/options.h"
  56 #include "vm/statistics.h"
  57 #include "vm/stringlocal.h"
  58 #include "vm/utf8.h"
  59
  60
  61 /* global variables ***********************************************************/
  62
  63 /* hashsize must be power of 2 */
  64
  65 #define HASHTABLE_UTF_SIZE    16384     /* initial size of utf-hash           */
  66
  67 hashtable *hashtable_utf;               /* hashtable for utf8-symbols         */
  68
  69
  70 /* utf-symbols for pointer comparison of frequently used strings **************/
  71
  72 utf *utf_java_lang_Object;
  73
  74 utf *utf_java_lang_Class;
  75 utf *utf_java_lang_ClassLoader;
  76 utf *utf_java_lang_Cloneable;
  77 utf *utf_java_lang_SecurityManager;
  78 utf *utf_java_lang_String;
  79 utf *utf_java_lang_System;
  80 utf *utf_java_lang_ThreadGroup;
  81 utf *utf_java_io_Serializable;
  82
  83 utf *utf_java_lang_Throwable;
  84 utf *utf_java_lang_VMThrowable;
  85 utf *utf_java_lang_Error;
  86 utf *utf_java_lang_AbstractMethodError;
  87 utf *utf_java_lang_LinkageError;
  88 utf *utf_java_lang_NoClassDefFoundError;
  89 utf *utf_java_lang_NoSuchMethodError;
  90 utf *utf_java_lang_OutOfMemoryError;
  91
  92 utf *utf_java_lang_Exception;
  93 utf *utf_java_lang_ClassNotFoundException;
  94 utf *utf_java_lang_IllegalArgumentException;
  95 utf *utf_java_lang_IllegalMonitorStateException;
  96
  97 utf *utf_java_lang_NullPointerException;
  98
  99 utf* utf_java_lang_Void;
 100 utf* utf_java_lang_Boolean;
 101 utf* utf_java_lang_Byte;
 102 utf* utf_java_lang_Character;
 103 utf* utf_java_lang_Short;
 104 utf* utf_java_lang_Integer;
 105 utf* utf_java_lang_Long;
 106 utf* utf_java_lang_Float;
 107 utf* utf_java_lang_Double;
 108
 109 utf *utf_java_lang_StackTraceElement;
 110 utf *utf_java_lang_reflect_Constructor;
 111 utf *utf_java_lang_reflect_Field;
 112 utf *utf_java_lang_reflect_Method;
 113 utf *utf_java_util_Vector;
 114
 115 utf *utf_InnerClasses;                  /* InnerClasses                       */
 116 utf *utf_ConstantValue;                 /* ConstantValue                      */
 117 utf *utf_Code;                          /* Code                               */
 118 utf *utf_Exceptions;                    /* Exceptions                         */
 119 utf *utf_LineNumberTable;               /* LineNumberTable                    */
 120 utf *utf_SourceFile;                    /* SourceFile                         */
 121
 122 utf *utf_init;                          /* <init>                             */
 123 utf *utf_clinit;                        /* <clinit>                           */
 124 utf *utf_clone;                         /* clone                              */
 125 utf *utf_finalize;                      /* finalize                           */
 126 utf *utf_run;                           /* run                                */
 127
 128 utf *utf_add;                           /* add                                */
 129 utf *utf_remove;                        /* remove                             */
 130 utf *utf_put;                           /* put                                */
 131 utf *utf_get;                           /* get                                */
 132 utf *utf_value;                         /* value                              */
 133
 134 utf *utf_fillInStackTrace;
 135 utf *utf_getSystemClassLoader;
 136 utf *utf_loadClass;
 137 utf *utf_printStackTrace;
 138
 139 utf *utf_Z;                             /* Z                                  */
 140 utf *utf_B;                             /* B                                  */
 141 utf *utf_C;                             /* C                                  */
 142 utf *utf_S;                             /* S                                  */
 143 utf *utf_I;                             /* I                                  */
 144 utf *utf_J;                             /* J                                  */
 145 utf *utf_F;                             /* F                                  */
 146 utf *utf_D;                             /* D                                  */
 147
 148 utf *utf_void__void;                    /* ()V                                */
 149 utf *utf_boolean__void;                 /* (Z)V                               */
 150 utf *utf_byte__void;                    /* (B)V                               */
 151 utf *utf_char__void;                    /* (C)V                               */
 152 utf *utf_short__void;                   /* (S)V                               */
 153 utf *utf_int__void;                     /* (I)V                               */
 154 utf *utf_long__void;                    /* (J)V                               */
 155 utf *utf_float__void;                   /* (F)V                               */
 156 utf *utf_double__void;                  /* (D)V                               */
 157
 158 utf *utf_void__java_lang_ClassLoader;   /* ()Ljava/lang/ClassLoader;          */
 159 utf *utf_void__java_lang_Object;        /* ()Ljava/lang/Object;               */
 160 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
 161 utf *utf_java_lang_Object__java_lang_Object;
 162 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
 163 utf *utf_java_lang_String__java_lang_Class;
 164 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
 165
 166 utf *utf_not_named_yet;                 /* special name for unnamed classes   */
 167 utf *utf_null;
 168 utf *array_packagename;
 169
 170
 171 /* utf_init ********************************************************************
 172
 173    Initializes the utf8 subsystem.
 174
 175 *******************************************************************************/
 176
 177 bool utf8_init(void)
 178 {
 179         /* create utf8 hashtable */
 180
 181         hashtable_utf = NEW(hashtable);
 182
 183         hashtable_create(hashtable_utf, HASHTABLE_UTF_SIZE);
 184
 185 #if defined(ENABLE_STATISTICS)
 186         if (opt_stat)
 187                 count_utf_len += sizeof(utf*) * hashtable_utf->size;
 188 #endif
 189
 190         /* create utf-symbols for pointer comparison of frequently used strings */
 191
 192         utf_java_lang_Object           = utf_new_char("java/lang/Object");
 193
 194         utf_java_lang_Class            = utf_new_char("java/lang/Class");
 195         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
 196         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
 197         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
 198         utf_java_lang_String           = utf_new_char("java/lang/String");
 199         utf_java_lang_System           = utf_new_char("java/lang/System");
 200         utf_java_lang_ThreadGroup      = utf_new_char("java/lang/ThreadGroup");
 201         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
 202
 203         utf_java_lang_Throwable        = utf_new_char(string_java_lang_Throwable);
 204         utf_java_lang_VMThrowable      = utf_new_char(string_java_lang_VMThrowable);
 205         utf_java_lang_Error            = utf_new_char(string_java_lang_Error);
 206
 207         utf_java_lang_AbstractMethodError =
 208                 utf_new_char(string_java_lang_AbstractMethodError);
 209
 210         utf_java_lang_LinkageError =
 211                 utf_new_char(string_java_lang_LinkageError);
 212
 213         utf_java_lang_NoClassDefFoundError =
 214                 utf_new_char(string_java_lang_NoClassDefFoundError);
 215
 216         utf_java_lang_NoSuchMethodError =
 217                 utf_new_char(string_java_lang_NoSuchMethodError);
 218
 219         utf_java_lang_OutOfMemoryError =
 220                 utf_new_char(string_java_lang_OutOfMemoryError);
 221
 222         utf_java_lang_Exception        = utf_new_char(string_java_lang_Exception);
 223
 224         utf_java_lang_ClassNotFoundException =
 225                 utf_new_char(string_java_lang_ClassNotFoundException);
 226
 227         utf_java_lang_IllegalArgumentException =
 228                 utf_new_char(string_java_lang_IllegalArgumentException);
 229
 230         utf_java_lang_IllegalMonitorStateException =
 231                 utf_new_char(string_java_lang_IllegalMonitorStateException);
 232
 233         utf_java_lang_NullPointerException =
 234                 utf_new_char(string_java_lang_NullPointerException);
 235
 236         utf_java_lang_Void             = utf_new_char("java/lang/Void");
 237         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
 238         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
 239         utf_java_lang_Character        = utf_new_char("java/lang/Character");
 240         utf_java_lang_Short            = utf_new_char("java/lang/Short");
 241         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
 242         utf_java_lang_Long             = utf_new_char("java/lang/Long");
 243         utf_java_lang_Float            = utf_new_char("java/lang/Float");
 244         utf_java_lang_Double           = utf_new_char("java/lang/Double");
 245
 246         utf_java_lang_StackTraceElement =
 247                 utf_new_char("java/lang/StackTraceElement");
 248
 249         utf_java_lang_reflect_Constructor =
 250                 utf_new_char("java/lang/reflect/Constructor");
 251
 252         utf_java_lang_reflect_Field    = utf_new_char("java/lang/reflect/Field");
 253         utf_java_lang_reflect_Method   = utf_new_char("java/lang/reflect/Method");
 254         utf_java_util_Vector           = utf_new_char("java/util/Vector");
 255
 256         utf_InnerClasses               = utf_new_char("InnerClasses");
 257         utf_ConstantValue              = utf_new_char("ConstantValue");
 258         utf_Code                       = utf_new_char("Code");
 259         utf_Exceptions                 = utf_new_char("Exceptions");
 260         utf_LineNumberTable            = utf_new_char("LineNumberTable");
 261         utf_SourceFile                 = utf_new_char("SourceFile");
 262
 263         utf_init                           = utf_new_char("<init>");
 264         utf_clinit                         = utf_new_char("<clinit>");
 265         utf_clone                      = utf_new_char("clone");
 266         utf_finalize                   = utf_new_char("finalize");
 267         utf_run                        = utf_new_char("run");
 268
 269         utf_add                        = utf_new_char("add");
 270         utf_remove                     = utf_new_char("remove");
 271         utf_put                        = utf_new_char("put");
 272         utf_get                        = utf_new_char("get");
 273         utf_value                      = utf_new_char("value");
 274
 275         utf_printStackTrace            = utf_new_char("printStackTrace");
 276         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
 277         utf_loadClass                  = utf_new_char("loadClass");
 278         utf_getSystemClassLoader       = utf_new_char("getSystemClassLoader");
 279
 280         utf_Z                          = utf_new_char("Z");
 281         utf_B                          = utf_new_char("B");
 282         utf_C                          = utf_new_char("C");
 283         utf_S                          = utf_new_char("S");
 284         utf_I                          = utf_new_char("I");
 285         utf_J                          = utf_new_char("J");
 286         utf_F                          = utf_new_char("F");
 287         utf_D                          = utf_new_char("D");
 288
 289         utf_void__void                 = utf_new_char("()V");
 290         utf_boolean__void              = utf_new_char("(Z)V");
 291         utf_byte__void                 = utf_new_char("(B)V");
 292         utf_char__void                 = utf_new_char("(C)V");
 293         utf_short__void                = utf_new_char("(S)V");
 294         utf_int__void                  = utf_new_char("(I)V");
 295         utf_long__void                 = utf_new_char("(J)V");
 296         utf_float__void                = utf_new_char("(F)V");
 297         utf_double__void               = utf_new_char("(D)V");
 298         utf_void__java_lang_Object     = utf_new_char("()Ljava/lang/Object;");
 299         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
 300
 301         utf_void__java_lang_ClassLoader =
 302                 utf_new_char("()Ljava/lang/ClassLoader;");
 303
 304         utf_java_lang_Object__java_lang_Object =
 305                 utf_new_char("(Ljava/lang/Object;)Ljava/lang/Object;");
 306
 307         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
 308
 309         utf_java_lang_String__java_lang_Class =
 310                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
 311
 312         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
 313
 314         utf_null                       = utf_new_char("null");
 315         utf_not_named_yet              = utf_new_char("\t<not_named_yet>");
 316         array_packagename              = utf_new_char("\t<the array package>");
 317
 318         /* everything's ok */
 319
 320         return true;
 321 }
 322
 323
 324 /* utf_hashkey *****************************************************************
 325
 326    The hashkey is computed from the utf-text by using up to 8
 327    characters.  For utf-symbols longer than 15 characters 3 characters
 328    are taken from the beginning and the end, 2 characters are taken
 329    from the middle.
 330
 331 *******************************************************************************/
 332
 333 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
 334 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
 335
 336 u4 utf_hashkey(const char *text, u4 length)
 337 {
 338         const char *start_pos = text;       /* pointer to utf text                */
 339         u4 a;
 340
 341         switch (length) {
 342         case 0: /* empty string */
 343                 return 0;
 344
 345         case 1: return fbs(0);
 346         case 2: return fbs(0) ^ nbs(3);
 347         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
 348         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
 349         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
 350         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
 351         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
 352         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
 353
 354         case 9:
 355                 a = fbs(0);
 356                 a ^= nbs(1);
 357                 a ^= nbs(2);
 358                 text++;
 359                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
 360
 361         case 10:
 362                 a = fbs(0);
 363                 text++;
 364                 a ^= nbs(2);
 365                 a ^= nbs(3);
 366                 a ^= nbs(4);
 367                 text++;
 368                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
 369
 370         case 11:
 371                 a = fbs(0);
 372                 text++;
 373                 a ^= nbs(2);
 374                 a ^= nbs(3);
 375                 a ^= nbs(4);
 376                 text++;
 377                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
 378
 379         case 12:
 380                 a = fbs(0);
 381                 text += 2;
 382                 a ^= nbs(2);
 383                 a ^= nbs(3);
 384                 text++;
 385                 a ^= nbs(5);
 386                 a ^= nbs(6);
 387                 a ^= nbs(7);
 388                 text++;
 389                 return a ^ nbs(9) ^ nbs(10);
 390
 391         case 13:
 392                 a = fbs(0);
 393                 a ^= nbs(1);
 394                 text++;
 395                 a ^= nbs(3);
 396                 a ^= nbs(4);
 397                 text += 2;
 398                 a ^= nbs(7);
 399                 a ^= nbs(8);
 400                 text += 2;
 401                 return a ^ nbs(9) ^ nbs(10);
 402
 403         case 14:
 404                 a = fbs(0);
 405                 text += 2;
 406                 a ^= nbs(3);
 407                 a ^= nbs(4);
 408                 text += 2;
 409                 a ^= nbs(7);
 410                 a ^= nbs(8);
 411                 text += 2;
 412                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 413
 414         case 15:
 415                 a = fbs(0);
 416                 text += 2;
 417                 a ^= nbs(3);
 418                 a ^= nbs(4);
 419                 text += 2;
 420                 a ^= nbs(7);
 421                 a ^= nbs(8);
 422                 text += 2;
 423                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 424
 425         default:  /* 3 characters from beginning */
 426                 a = fbs(0);
 427                 text += 2;
 428                 a ^= nbs(3);
 429                 a ^= nbs(4);
 430
 431                 /* 2 characters from middle */
 432                 text = start_pos + (length / 2);
 433                 a ^= fbs(5);
 434                 text += 2;
 435                 a ^= nbs(6);
 436
 437                 /* 3 characters from end */
 438                 text = start_pos + length - 4;
 439
 440                 a ^= fbs(7);
 441                 text++;
 442
 443                 return a ^ nbs(10) ^ nbs(11);
 444     }
 445 }
 446
 447 /* utf_full_hashkey ************************************************************
 448
 449    This function computes a hash value using all bytes in the string.
 450
 451    The algorithm is the "One-at-a-time" algorithm as published
 452    by Bob Jenkins on http://burtleburtle.net/bob/hash/doobs.html.
 453
 454 *******************************************************************************/
 455
 456 u4 utf_full_hashkey(const char *text, u4 length)
 457 {
 458         register const unsigned char *p = (const unsigned char *) text;
 459         register u4 hash;
 460         register u4 i;
 461
 462         hash = 0;
 463         for (i=length; i--;)
 464         {
 465             hash += *p++;
 466             hash += (hash << 10);
 467             hash ^= (hash >> 6);
 468         }
 469         hash += (hash << 3);
 470         hash ^= (hash >> 11);
 471         hash += (hash << 15);
 472
 473         return hash;
 474 }
 475
 476 /* unicode_hashkey *************************************************************
 477
 478    Compute the hashkey of a unicode string.
 479
 480 *******************************************************************************/
 481
 482 u4 unicode_hashkey(u2 *text, u2 len)
 483 {
 484         return utf_hashkey((char *) text, len);
 485 }
 486
 487
 488 /* utf_new *********************************************************************
 489
 490    Creates a new utf-symbol, the text of the symbol is passed as a
 491    u1-array. The function searches the utf-hashtable for a utf-symbol
 492    with this text. On success the element returned, otherwise a new
 493    hashtable element is created.
 494
 495    If the number of entries in the hashtable exceeds twice the size of
 496    the hashtable slots a reorganization of the hashtable is done and
 497    the utf symbols are copied to a new hashtable with doubled size.
 498
 499 *******************************************************************************/
 500
 501 utf *utf_new(const char *text, u2 length)
 502 {
 503         u4 key;                             /* hashkey computed from utf-text     */
 504         u4 slot;                            /* slot in hashtable                  */
 505         utf *u;                             /* hashtable element                  */
 506         u2 i;
 507
 508 #if defined(ENABLE_THREADS)
 509         builtin_monitorenter(hashtable_utf->header);
 510 #endif
 511
 512 #if defined(ENABLE_STATISTICS)
 513         if (opt_stat)
 514                 count_utf_new++;
 515 #endif
 516
 517         key  = utf_hashkey(text, length);
 518         slot = key & (hashtable_utf->size - 1);
 519         u    = hashtable_utf->ptr[slot];
 520
 521         /* search external hash chain for utf-symbol */
 522
 523         while (u) {
 524                 if (u->blength == length) {
 525                         /* compare text of hashtable elements */
 526
 527                         for (i = 0; i < length; i++)
 528                                 if (text[i] != u->text[i])
 529                                         goto nomatch;
 530
 531 #if defined(ENABLE_STATISTICS)
 532                         if (opt_stat)
 533                                 count_utf_new_found++;
 534 #endif
 535
 536                         /* symbol found in hashtable */
 537
 538 #if defined(ENABLE_THREADS)
 539                         builtin_monitorexit(hashtable_utf->header);
 540 #endif
 541
 542                         return u;
 543                 }
 544
 545         nomatch:
 546                 u = u->hashlink; /* next element in external chain */
 547         }
 548
 549 #if defined(ENABLE_STATISTICS)
 550         if (opt_stat)
 551                 count_utf_len += sizeof(utf) + length + 1;
 552 #endif
 553
 554         /* location in hashtable found, create new utf element */
 555         u = NEW(utf);
 556         u->blength  = length;               /* length in bytes of utfstring       */
 557         u->hashlink = hashtable_utf->ptr[slot]; /* link in external hashchain     */
 558         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
 559
 560         memcpy(u->text, text, length);      /* copy utf-text                      */
 561         u->text[length] = '\0';
 562
 563         hashtable_utf->ptr[slot] = u;       /* insert symbol into table           */
 564         hashtable_utf->entries++;           /* update number of entries           */
 565
 566         if (hashtable_utf->entries > (hashtable_utf->size * 2)) {
 567
 568         /* reorganization of hashtable, average length of the external
 569            chains is approx. 2 */
 570
 571                 hashtable *newhash;                              /* the new hashtable */
 572                 u4         i;
 573                 utf       *u;
 574                 utf       *nextu;
 575                 u4         slot;
 576
 577                 /* create new hashtable, double the size */
 578
 579                 newhash = hashtable_resize(hashtable_utf, hashtable_utf->size * 2);
 580
 581 #if defined(ENABLE_STATISTICS)
 582                 if (opt_stat)
 583                         count_utf_len += sizeof(utf*) * hashtable_utf->size;
 584 #endif
 585
 586                 /* transfer elements to new hashtable */
 587
 588                 for (i = 0; i < hashtable_utf->size; i++) {
 589                         u = hashtable_utf->ptr[i];
 590
 591                         while (u) {
 592                                 nextu = u->hashlink;
 593                                 slot  = utf_hashkey(u->text, u->blength) & (newhash->size - 1);
 594
 595                                 u->hashlink = (utf *) newhash->ptr[slot];
 596                                 newhash->ptr[slot] = u;
 597
 598                                 /* follow link in external hash chain */
 599
 600                                 u = nextu;
 601                         }
 602                 }
 603
 604                 /* dispose old table */
 605
 606                 hashtable_free(hashtable_utf);
 607
 608                 hashtable_utf = newhash;
 609         }
 610
 611 #if defined(ENABLE_THREADS)
 612         builtin_monitorexit(hashtable_utf->header);
 613 #endif
 614
 615         return u;
 616 }
 617
 618
 619 /* utf_new_u2 ******************************************************************
 620
 621    Make utf symbol from u2 array, if isclassname is true '.' is
 622    replaced by '/'.
 623
 624 *******************************************************************************/
 625
 626 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
 627 {
 628         char *buffer;                   /* memory buffer for  unicode characters  */
 629         char *pos;                      /* pointer to current position in buffer  */
 630         u4 left;                        /* unicode characters left                */
 631         u4 buflength;                   /* utf length in bytes of the u2 array    */
 632         utf *result;                    /* resulting utf-string                   */
 633         int i;
 634
 635         /* determine utf length in bytes and allocate memory */
 636
 637         buflength = u2_utflength(unicode_pos, unicode_length);
 638         buffer    = MNEW(char, buflength);
 639
 640         left = buflength;
 641         pos  = buffer;
 642
 643         for (i = 0; i++ < unicode_length; unicode_pos++) {
 644                 /* next unicode character */
 645                 u2 c = *unicode_pos;
 646
 647                 if ((c != 0) && (c < 0x80)) {
 648                         /* 1 character */
 649                         left--;
 650                 if ((int) left < 0) break;
 651                         /* convert classname */
 652                         if (isclassname && c == '.')
 653                                 *pos++ = '/';
 654                         else
 655                                 *pos++ = (char) c;
 656
 657                 } else if (c < 0x800) {
 658                         /* 2 characters */
 659                 unsigned char high = c >> 6;
 660                 unsigned char low  = c & 0x3F;
 661                         left = left - 2;
 662                 if ((int) left < 0) break;
 663                 *pos++ = high | 0xC0;
 664                 *pos++ = low  | 0x80;
 665
 666                 } else {
 667                 /* 3 characters */
 668                 char low  = c & 0x3f;
 669                 char mid  = (c >> 6) & 0x3F;
 670                 char high = c >> 12;
 671                         left = left - 3;
 672                 if ((int) left < 0) break;
 673                 *pos++ = high | 0xE0;
 674                 *pos++ = mid  | 0x80;
 675                 *pos++ = low  | 0x80;
 676                 }
 677         }
 678
 679         /* insert utf-string into symbol-table */
 680         result = utf_new(buffer,buflength);
 681
 682         MFREE(buffer, char, buflength);
 683
 684         return result;
 685 }
 686
 687
 688 /* utf_new_char ****************************************************************
 689
 690    Creates a new utf symbol, the text for this symbol is passed as a
 691    c-string ( = char* ).
 692
 693 *******************************************************************************/
 694
 695 utf *utf_new_char(const char *text)
 696 {
 697         return utf_new(text, strlen(text));
 698 }
 699
 700
 701 /* utf_new_char_classname ******************************************************
 702
 703    Creates a new utf symbol, the text for this symbol is passed as a
 704    c-string ( = char* ) "." characters are going to be replaced by
 705    "/". Since the above function is used often, this is a separte
 706    function, instead of an if.
 707
 708 *******************************************************************************/
 709
 710 utf *utf_new_char_classname(const char *text)
 711 {
 712         if (strchr(text, '.')) {
 713                 char *txt = strdup(text);
 714                 char *end = txt + strlen(txt);
 715                 char *c;
 716                 utf *tmpRes;
 717
 718                 for (c = txt; c < end; c++)
 719                         if (*c == '.') *c = '/';
 720
 721                 tmpRes = utf_new(txt, strlen(txt));
 722                 FREE(txt, 0);
 723
 724                 return tmpRes;
 725
 726         } else
 727                 return utf_new(text, strlen(text));
 728 }
 729
 730
 731 /* utf_nextu2 ******************************************************************
 732
 733    Read the next unicode character from the utf string and increment
 734    the utf-string pointer accordingly.
 735
 736 *******************************************************************************/
 737
 738 u2 utf_nextu2(char **utf_ptr)
 739 {
 740     /* uncompressed unicode character */
 741     u2 unicode_char = 0;
 742     /* current position in utf text */
 743     unsigned char *utf = (unsigned char *) (*utf_ptr);
 744     /* bytes representing the unicode character */
 745     unsigned char ch1, ch2, ch3;
 746     /* number of bytes used to represent the unicode character */
 747     int len = 0;
 748
 749     switch ((ch1 = utf[0]) >> 4) {
 750         default: /* 1 byte */
 751                 (*utf_ptr)++;
 752                 return (u2) ch1;
 753         case 0xC:
 754         case 0xD: /* 2 bytes */
 755                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 756                         unsigned char high = ch1 & 0x1F;
 757                         unsigned char low  = ch2 & 0x3F;
 758                         unicode_char = (high << 6) + low;
 759                         len = 2;
 760                 }
 761                 break;
 762
 763         case 0xE: /* 2 or 3 bytes */
 764                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 765                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
 766                                 unsigned char low  = ch3 & 0x3f;
 767                                 unsigned char mid  = ch2 & 0x3f;
 768                                 unsigned char high = ch1 & 0x0f;
 769                                 unicode_char = (((high << 6) + mid) << 6) + low;
 770                                 len = 3;
 771                         } else
 772                                 len = 2;
 773                 }
 774                 break;
 775     }
 776
 777     /* update position in utf-text */
 778     *utf_ptr = (char *) (utf + len);
 779
 780     return unicode_char;
 781 }
 782
 783
 784 /* utf_bytes *******************************************************************
 785
 786    Determine number of bytes (aka. octets) in the utf string.
 787
 788    IN:
 789       u............utf string
 790
 791    OUT:
 792       The number of octets of this utf string.
 793           There is _no_ terminating zero included in this count.
 794
 795 *******************************************************************************/
 796
 797 u4 utf_bytes(utf *u)
 798 {
 799         return u->blength;
 800 }
 801
 802 /* utf_get_number_of_u2s_for_buffer ********************************************
 803
 804    Determine number of UTF-16 u2s in the given UTF-8 buffer
 805
 806    CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
 807    to an array of u2s (UTF-16) and want to know how many of them you will get.
 808    All other uses of this function are probably wrong.
 809
 810    IN:
 811       buffer........points to first char in buffer
 812           blength.......number of _bytes_ in the buffer
 813
 814    OUT:
 815       the number of u2s needed to hold this string in UTF-16 encoding.
 816           There is _no_ terminating zero included in this count.
 817
 818    NOTE: Unlike utf_get_number_of_u2s, this function never throws an
 819    exception.
 820
 821 *******************************************************************************/
 822
 823 u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
 824 {
 825         const char *endpos;                 /* points behind utf string           */
 826         const char *utf_ptr;                /* current position in utf text       */
 827         u4 len = 0;                         /* number of unicode characters       */
 828
 829         utf_ptr = buffer;
 830         endpos = utf_ptr + blength;
 831
 832         while (utf_ptr < endpos) {
 833                 len++;
 834                 /* next unicode character */
 835                 utf_nextu2((char **)&utf_ptr);
 836         }
 837
 838         assert(utf_ptr == endpos);
 839
 840         return len;
 841 }
 842
 843
 844 /* utf_get_number_of_u2s *******************************************************
 845
 846    Determine number of UTF-16 u2s in the utf string.
 847
 848    CAUTION: Use this function *only* when you want to convert a utf string
 849    to an array of u2s and want to know how many of them you will get.
 850    All other uses of this function are probably wrong.
 851
 852    IN:
 853       u............utf string
 854
 855    OUT:
 856       the number of u2s needed to hold this string in UTF-16 encoding.
 857           There is _no_ terminating zero included in this count.
 858           XXX 0 if a NullPointerException has been thrown (see below)
 859
 860 *******************************************************************************/
 861
 862 u4 utf_get_number_of_u2s(utf *u)
 863 {
 864         char *endpos;                       /* points behind utf string           */
 865         char *utf_ptr;                      /* current position in utf text       */
 866         u4 len = 0;                         /* number of unicode characters       */
 867
 868         /* XXX this is probably not checked by most callers! Review this after */
 869         /* the invalid uses of this function have been eliminated */
 870         if (!u) {
 871                 exceptions_throw_nullpointerexception();
 872                 return 0;
 873         }
 874
 875         endpos = UTF_END(u);
 876         utf_ptr = u->text;
 877
 878         while (utf_ptr < endpos) {
 879                 len++;
 880                 /* next unicode character */
 881                 utf_nextu2(&utf_ptr);
 882         }
 883
 884         if (utf_ptr != endpos)
 885                 /* string ended abruptly */
 886                 throw_cacao_exception_exit(string_java_lang_InternalError,
 887                                                                    "Illegal utf8 string");
 888
 889         return len;
 890 }
 891
 892
 893 /* u2_utflength ****************************************************************
 894
 895    Returns the utf length in bytes of a u2 array.
 896
 897 *******************************************************************************/
 898
 899 u4 u2_utflength(u2 *text, u4 u2_length)
 900 {
 901         u4 result_len = 0;                  /* utf length in bytes                */
 902         u2 ch;                              /* current unicode character          */
 903         u4 len;
 904
 905         for (len = 0; len < u2_length; len++) {
 906                 /* next unicode character */
 907                 ch = *text++;
 908
 909                 /* determine bytes required to store unicode character as utf */
 910                 if (ch && (ch < 0x80))
 911                         result_len++;
 912                 else if (ch < 0x800)
 913                         result_len += 2;
 914                 else
 915                         result_len += 3;
 916         }
 917
 918     return result_len;
 919 }
 920
 921
 922 /* utf_copy ********************************************************************
 923
 924    Copy the given utf string byte-for-byte to a buffer.
 925
 926    IN:
 927       buffer.......the buffer
 928           u............the utf string
 929
 930 *******************************************************************************/
 931
 932 void utf_copy(char *buffer, utf *u)
 933 {
 934         /* our utf strings are zero-terminated (done by utf_new) */
 935         MCOPY(buffer, u->text, char, u->blength + 1);
 936 }
 937
 938
 939 /* utf_cat *********************************************************************
 940
 941    Append the given utf string byte-for-byte to a buffer.
 942
 943    IN:
 944       buffer.......the buffer
 945           u............the utf string
 946
 947 *******************************************************************************/
 948
 949 void utf_cat(char *buffer, utf *u)
 950 {
 951         /* our utf strings are zero-terminated (done by utf_new) */
 952         MCOPY(buffer + strlen(buffer), u->text, char, u->blength + 1);
 953 }
 954
 955
 956 /* utf_copy_classname **********************************************************
 957
 958    Copy the given utf classname byte-for-byte to a buffer.
 959    '/' is replaced by '.'
 960
 961    IN:
 962       buffer.......the buffer
 963           u............the utf string
 964
 965 *******************************************************************************/
 966
 967 void utf_copy_classname(char *buffer, utf *u)
 968 {
 969         char *bufptr;
 970         char *srcptr;
 971         char *endptr;
 972         char ch;
 973
 974         bufptr = buffer;
 975         srcptr = u->text;
 976         endptr = UTF_END(u) + 1; /* utfs are zero-terminared by utf_new */
 977
 978         while (srcptr != endptr) {
 979                 ch = *srcptr++;
 980                 if (ch == '/')
 981                         ch = '.';
 982                 *bufptr++ = ch;
 983         }
 984 }
 985
 986
 987 /* utf_cat *********************************************************************
 988
 989    Append the given utf classname byte-for-byte to a buffer.
 990    '/' is replaced by '.'
 991
 992    IN:
 993       buffer.......the buffer
 994           u............the utf string
 995
 996 *******************************************************************************/
 997
 998 void utf_cat_classname(char *buffer, utf *u)
 999 {
1000         utf_copy_classname(buffer + strlen(buffer), u);
1001 }
1002
1003 /* utf_display_printable_ascii *************************************************
1004
1005    Write utf symbol to stdout (for debugging purposes).
1006    Non-printable and non-ASCII characters are printed as '?'.
1007
1008 *******************************************************************************/
1009
1010 void utf_display_printable_ascii(utf *u)
1011 {
1012         char *endpos;                       /* points behind utf string           */
1013         char *utf_ptr;                      /* current position in utf text       */
1014
1015         if (u == NULL) {
1016                 printf("NULL");
1017                 fflush(stdout);
1018                 return;
1019         }
1020
1021         endpos = UTF_END(u);
1022         utf_ptr = u->text;
1023
1024         while (utf_ptr < endpos) {
1025                 /* read next unicode character */
1026
1027                 u2 c = utf_nextu2(&utf_ptr);
1028
1029                 if ((c >= 32) && (c <= 127))
1030                         printf("%c", c);
1031                 else
1032                         printf("?");
1033         }
1034
1035         fflush(stdout);
1036 }
1037
1038
1039 /* utf_display_printable_ascii_classname ***************************************
1040
1041    Write utf symbol to stdout with `/' converted to `.' (for debugging
1042    purposes).
1043    Non-printable and non-ASCII characters are printed as '?'.
1044
1045 *******************************************************************************/
1046
1047 void utf_display_printable_ascii_classname(utf *u)
1048 {
1049         char *endpos;                       /* points behind utf string           */
1050         char *utf_ptr;                      /* current position in utf text       */
1051
1052         if (u == NULL) {
1053                 printf("NULL");
1054                 fflush(stdout);
1055                 return;
1056         }
1057
1058         endpos = UTF_END(u);
1059         utf_ptr = u->text;
1060
1061         while (utf_ptr < endpos) {
1062                 /* read next unicode character */
1063
1064                 u2 c = utf_nextu2(&utf_ptr);
1065
1066                 if (c == '/')
1067                         c = '.';
1068
1069                 if ((c >= 32) && (c <= 127))
1070                         printf("%c", c);
1071                 else
1072                         printf("?");
1073         }
1074
1075         fflush(stdout);
1076 }
1077
1078
1079 /* utf_sprint_convert_to_latin1 ************************************************
1080
1081    Write utf symbol into c-string (for debugging purposes).
1082    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1083    invalid results.
1084
1085 *******************************************************************************/
1086
1087 void utf_sprint_convert_to_latin1(char *buffer, utf *u)
1088 {
1089         char *endpos;                       /* points behind utf string           */
1090         char *utf_ptr;                      /* current position in utf text       */
1091         u2 pos = 0;                         /* position in c-string               */
1092
1093         if (!u) {
1094                 strcpy(buffer, "NULL");
1095                 return;
1096         }
1097
1098         endpos = UTF_END(u);
1099         utf_ptr = u->text;
1100
1101         while (utf_ptr < endpos)
1102                 /* copy next unicode character */
1103                 buffer[pos++] = utf_nextu2(&utf_ptr);
1104
1105         /* terminate string */
1106         buffer[pos] = '\0';
1107 }
1108
1109
1110 /* utf_sprint_convert_to_latin1_classname **************************************
1111
1112    Write utf symbol into c-string with `/' converted to `.' (for debugging
1113    purposes).
1114    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1115    invalid results.
1116
1117 *******************************************************************************/
1118
1119 void utf_sprint_convert_to_latin1_classname(char *buffer, utf *u)
1120 {
1121         char *endpos;                       /* points behind utf string           */
1122         char *utf_ptr;                      /* current position in utf text       */
1123         u2 pos = 0;                         /* position in c-string               */
1124
1125         if (!u) {
1126                 strcpy(buffer, "NULL");
1127                 return;
1128         }
1129
1130         endpos = UTF_END(u);
1131         utf_ptr = u->text;
1132
1133         while (utf_ptr < endpos) {
1134                 /* copy next unicode character */
1135                 u2 c = utf_nextu2(&utf_ptr);
1136                 if (c == '/') c = '.';
1137                 buffer[pos++] = c;
1138         }
1139
1140         /* terminate string */
1141         buffer[pos] = '\0';
1142 }
1143
1144
1145 /* utf_strcat_convert_to_latin1 ************************************************
1146
1147    Like libc strcat, but uses an utf8 string.
1148    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1149    invalid results.
1150
1151 *******************************************************************************/
1152
1153 void utf_strcat_convert_to_latin1(char *buffer, utf *u)
1154 {
1155         utf_sprint_convert_to_latin1(buffer + strlen(buffer), u);
1156 }
1157
1158
1159 /* utf_strcat_convert_to_latin1_classname **************************************
1160
1161    Like libc strcat, but uses an utf8 string.
1162    Characters are converted to 8-bit Latin-1, non-Latin-1 characters yield
1163    invalid results.
1164
1165 *******************************************************************************/
1166
1167 void utf_strcat_convert_to_latin1_classname(char *buffer, utf *u)
1168 {
1169         utf_sprint_convert_to_latin1_classname(buffer + strlen(buffer), u);
1170 }
1171
1172
1173 /* utf_fprint_printable_ascii **************************************************
1174
1175    Write utf symbol into file.
1176    Non-printable and non-ASCII characters are printed as '?'.
1177
1178 *******************************************************************************/
1179
1180 void utf_fprint_printable_ascii(FILE *file, utf *u)
1181 {
1182         char *endpos;                       /* points behind utf string           */
1183         char *utf_ptr;                      /* current position in utf text       */
1184
1185         if (!u)
1186                 return;
1187
1188         endpos = UTF_END(u);
1189         utf_ptr = u->text;
1190
1191         while (utf_ptr < endpos) {
1192                 /* read next unicode character */
1193                 u2 c = utf_nextu2(&utf_ptr);
1194
1195                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1196                 else fprintf(file, "?");
1197         }
1198 }
1199
1200
1201 /* utf_fprint_printable_ascii_classname ****************************************
1202
1203    Write utf symbol into file with `/' converted to `.'.
1204    Non-printable and non-ASCII characters are printed as '?'.
1205
1206 *******************************************************************************/
1207
1208 void utf_fprint_printable_ascii_classname(FILE *file, utf *u)
1209 {
1210         char *endpos;                       /* points behind utf string           */
1211         char *utf_ptr;                      /* current position in utf text       */
1212
1213     if (!u)
1214                 return;
1215
1216         endpos = UTF_END(u);
1217         utf_ptr = u->text;
1218
1219         while (utf_ptr < endpos) {
1220                 /* read next unicode character */
1221                 u2 c = utf_nextu2(&utf_ptr);
1222                 if (c == '/') c = '.';
1223
1224                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
1225                 else fprintf(file, "?");
1226         }
1227 }
1228
1229
/* is_valid_utf ****************************************************************

   Check whether a byte range holds a valid UTF-8 string in the form
   accepted for Java.

   utf_ptr...points to first character
   end_pos...points after last character

   Returns false if
     - end_pos lies before utf_ptr,
     - the range contains a 0x00 byte,
     - a lead byte is neither ASCII, 110xxxxx nor 1110xxxx (sequences
       of more than three bytes are rejected: Java limitation),
     - a multi-byte sequence is cut off by end_pos or contains a byte
       that is not of the form 10xxxxxx,
     - the value zero is encoded in three bytes (only the two-byte
       encoding of zero is accepted: Java special).
   Returns true otherwise; an empty range is valid.

   Deliberately _not_ rejected, because Sun Java seems to accept them:
   overlong encodings, surrogates (0xd800..0xdfff) and the codepoints
   0xfffe and 0xffff.

*******************************************************************************/

bool is_valid_utf(char *utf_ptr, char *end_pos)
{
	int           remaining;            /* bytes not yet consumed             */
	int           extra;                /* continuation bytes of a sequence   */
	int           k;
	unsigned char b;                    /* byte under inspection              */
	unsigned long value;                /* decoded value of a sequence        */

	if (end_pos < utf_ptr)
		return false;

	remaining = end_pos - utf_ptr;

	while (remaining-- != 0) {
		b = (unsigned char) *utf_ptr++;

		if (b == 0)
			return false;                   /* 0x00 is not allowed */

		if (b < 0x80)
			continue;                       /* ASCII */

		if ((b & 0xe0) == 0xc0)
			extra = 1;                      /* 110x xxxx */
		else if ((b & 0xf0) == 0xe0)
			extra = 2;                      /* 1110 xxxx */
		else
			return false;                   /* stray continuation byte, lead
											   byte of a longer sequence, or
											   0xfe/0xff */

		/* the continuation bytes must lie inside the range */
		remaining -= extra;
		if (remaining < 0)
			return false;

		/* payload bits of the lead byte: 5 for two-byte sequences,
		   4 for three-byte sequences */
		value = b & (0x3f >> extra);

		for (k = 0; k < extra; k++) {
			b = (unsigned char) *utf_ptr++;

			if ((b & 0xc0) != 0x80)         /* 10xx xxxx */
				return false;

			value = (value << 6) | (b & 0x3f);
		}

		/* zero must be encoded in exactly two bytes */
		if ((value == 0) && (extra != 1))
			return false;
	}

	return true;
}
1295
1296
/* is_valid_name ***************************************************************

   Return true if the given string may be used as a class/field/method
   name. (Currently this only disallows empty strings, control
   characters and the two-byte encoding of zero, 0xc0 0x80.)

   NOTE: The string is assumed to have passed is_valid_utf!

   utf_ptr...points to first character
   end_pos...points after last character

   No byte at or behind end_pos is read.

*******************************************************************************/

bool is_valid_name(char *utf_ptr, char *end_pos)
{
	if (end_pos <= utf_ptr)
		return false;                       /* disallow empty names */

	while (utf_ptr < end_pos) {
		unsigned char c = (unsigned char) *utf_ptr++;

		if (c < 0x20)
			return false;                   /* disallow control characters */

		/* disallow zero (0xc0 0x80); the look-ahead is bounded so that a
		   0xc0 in the last position does not cause a read behind the
		   string */
		if ((c == 0xc0) && (utf_ptr < end_pos)
			&& ((unsigned char) *utf_ptr == 0x80))
			return false;
	}

	return true;
}
1324
1325 bool is_valid_name_utf(utf *u)
1326 {
1327         return is_valid_name(u->text, UTF_END(u));
1328 }
1329
1330
1331 /* utf_show ********************************************************************
1332
1333    Writes the utf symbols in the utfhash to stdout and displays the
1334    number of external hash chains grouped according to the chainlength
1335    (for debugging purposes).
1336
1337 *******************************************************************************/
1338
1339 #if !defined(NDEBUG)
1340 void utf_show(void)
1341 {
1342
1343 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
1344
1345         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
1346         u4 max_chainlength = 0;      /* maximum length of the chains */
1347         u4 sum_chainlength = 0;      /* sum of the chainlengths */
1348         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1349         u4 i;
1350
1351         printf("UTF-HASH:\n");
1352
1353         /* show element of utf-hashtable */
1354
1355         for (i = 0; i < hashtable_utf->size; i++) {
1356                 utf *u = hashtable_utf->ptr[i];
1357
1358                 if (u) {
1359                         printf("SLOT %d: ", (int) i);
1360
1361                         while (u) {
1362                                 printf("'");
1363                                 utf_display_printable_ascii(u);
1364                                 printf("' ");
1365                                 u = u->hashlink;
1366                         }
1367                         printf("\n");
1368                 }
1369         }
1370
1371         printf("UTF-HASH: %d slots for %d entries\n",
1372                    (int) hashtable_utf->size, (int) hashtable_utf->entries );
1373
1374         if (hashtable_utf->entries == 0)
1375                 return;
1376
1377         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1378
1379         for (i=0;i<CHAIN_LIMIT;i++)
1380                 chain_count[i]=0;
1381
1382         /* count numbers of hashchains according to their length */
1383         for (i=0; i<hashtable_utf->size; i++) {
1384
1385                 utf *u = (utf*) hashtable_utf->ptr[i];
1386                 u4 chain_length = 0;
1387
1388                 /* determine chainlength */
1389                 while (u) {
1390                         u = u->hashlink;
1391                         chain_length++;
1392                 }
1393
1394                 /* update sum of all chainlengths */
1395                 sum_chainlength+=chain_length;
1396
1397                 /* determine the maximum length of the chains */
1398                 if (chain_length>max_chainlength)
1399                         max_chainlength = chain_length;
1400
1401                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1402                 if (chain_length>=CHAIN_LIMIT) {
1403                         beyond_limit+=chain_length;
1404                         chain_length=CHAIN_LIMIT-1;
1405                 }
1406
1407                 /* update number of hashchains of current length */
1408                 chain_count[chain_length]++;
1409         }
1410
1411         /* display results */
1412         for (i=1;i<CHAIN_LIMIT-1;i++)
1413                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/hashtable_utf->entries));
1414
1415         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/hashtable_utf->entries);
1416
1417
1418         printf("max. chainlength:%5d\n",max_chainlength);
1419
1420         /* avg. chainlength = sum of chainlengths / number of chains */
1421         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (hashtable_utf->size-chain_count[0]));
1422 }
1423 #endif /* !defined(NDEBUG) */
1424
1425
1426 /*
1427  * These are local overrides for various environment variables in Emacs.
1428  * Please do not remove this and leave it at the end of the file, where
1429  * Emacs will automagically detect them.
1430  * ---------------------------------------------------------------------
1431  * Local variables:
1432  * mode: c
1433  * indent-tabs-mode: t
1434  * c-basic-offset: 4
1435  * tab-width: 4
1436  * End:
1437  * vim:noexpandtab:sw=4:ts=4:
1438  */