src/vm/utf8.c

   1 /* src/vm/utf8.c - utf functions
   2
   3    Copyright (C) 1996-2005 R. Grafl, A. Krall, C. Kruegel, C. Oates,
   4    R. Obermaisser, M. Platter, M. Probst, S. Ring, E. Steiner,
   5    C. Thalinger, D. Thuernbeck, P. Tomsich, C. Ullrich, J. Wenninger,
   6    Institut f. Computersprachen - TU Wien
   7
   8    This file is part of CACAO.
   9
  10    This program is free software; you can redistribute it and/or
  11    modify it under the terms of the GNU General Public License as
  12    published by the Free Software Foundation; either version 2, or (at
  13    your option) any later version.
  14
  15    This program is distributed in the hope that it will be useful, but
  16    WITHOUT ANY WARRANTY; without even the implied warranty of
  17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18    General Public License for more details.
  19
  20    You should have received a copy of the GNU General Public License
  21    along with this program; if not, write to the Free Software
  22    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
  23    02111-1307, USA.
  24
  25    Contact: cacao@complang.tuwien.ac.at
  26
  27    Authors: Reinhard Grafl
  28
  29    Changes: Mark Probst
  30             Andreas Krall
  31             Christian Thalinger
  32
  33    $Id: utf8.c 2061 2005-03-23 11:10:33Z twisti $
  34
  35 */
  36
  37
  38 #include <string.h>
  39
  40 #include "mm/memory.h"
  41 #include "vm/exceptions.h"
  42 #include "vm/options.h"
  43 #include "vm/statistics.h"
  44 #include "vm/tables.h"
  45 #include "vm/utf8.h"
  46
  47
  48 hashtable utf_hash;                     /* hashtable for utf8-symbols         */
  49
  50
  51 /* utf-symbols for pointer comparison of frequently used strings **************/
  52
  53 utf *utf_java_lang_Object;              /* java/lang/Object                   */
  54
  55 utf *utf_java_lang_Class;
  56 utf *utf_java_lang_ClassLoader;
  57 utf *utf_java_lang_Cloneable;
  58 utf *utf_java_lang_SecurityManager;
  59 utf *utf_java_lang_String;
  60 utf *utf_java_lang_System;
  61 utf *utf_java_io_Serializable;
  62
  63 utf *utf_java_lang_Throwable;
  64 utf *utf_java_lang_VMThrowable;
  65 utf *utf_java_lang_Exception;
  66 utf *utf_java_lang_Error;
  67 utf *utf_java_lang_OutOfMemoryError;
  68
  69 utf* utf_java_lang_Void;
  70 utf* utf_java_lang_Boolean;
  71 utf* utf_java_lang_Byte;
  72 utf* utf_java_lang_Character;
  73 utf* utf_java_lang_Short;
  74 utf* utf_java_lang_Integer;
  75 utf* utf_java_lang_Long;
  76 utf* utf_java_lang_Float;
  77 utf* utf_java_lang_Double;
  78
  79 utf *utf_java_util_Vector;
  80
  81 utf *utf_InnerClasses;                  /* InnerClasses                       */
  82 utf *utf_ConstantValue;                 /* ConstantValue                      */
  83 utf *utf_Code;                          /* Code                               */
  84 utf *utf_Exceptions;                    /* Exceptions                         */
  85 utf *utf_LineNumberTable;               /* LineNumberTable                    */
  86 utf *utf_SourceFile;                    /* SourceFile                         */
  87
  88 utf *utf_init;                          /* <init>                             */
  89 utf *utf_clinit;                        /* <clinit>                           */
  90 utf *utf_finalize;                      /* finalize                           */
  91
  92 utf *utf_printStackTrace;
  93 utf *utf_fillInStackTrace;
  94 utf *utf_loadClass;
  95
  96 utf *utf_void__void;                    /* ()V                                */
  97 utf *utf_boolean__void;                 /* (Z)V                               */
  98 utf *utf_byte__void;                    /* (B)V                               */
  99 utf *utf_char__void;                    /* (C)V                               */
 100 utf *utf_short__void;                   /* (S)V                               */
 101 utf *utf_int__void;                     /* (I)V                               */
 102 utf *utf_long__void;                    /* (J)V                               */
 103 utf *utf_float__void;                   /* (F)V                               */
 104 utf *utf_double__void;                  /* (D)V                               */
 105 utf *utf_void__java_lang_Throwable;     /* ()Ljava/lang/Throwable;            */
 106 utf *utf_java_lang_String__void;        /* (Ljava/lang/String;)V              */
 107 utf *utf_java_lang_String__java_lang_Class;
 108 utf *utf_java_lang_Throwable__void;     /* (Ljava/lang/Throwable;)V           */
 109
 110 utf *array_packagename;
 111
 112
 113 /* utf_init ********************************************************************
 114
 115    Initializes the utf8 subsystem.
 116
 117 *******************************************************************************/
 118
 119 void utf8_init(void)
 120 {
 121         /* create utf-symbols for pointer comparison of frequently used strings */
 122
 123         utf_java_lang_Object           = utf_new_char("java/lang/Object");
 124
 125         utf_java_lang_Class            = utf_new_char("java/lang/Class");
 126         utf_java_lang_ClassLoader      = utf_new_char("java/lang/ClassLoader");
 127         utf_java_lang_Cloneable        = utf_new_char("java/lang/Cloneable");
 128         utf_java_lang_SecurityManager  = utf_new_char("java/lang/SecurityManager");
 129         utf_java_lang_String           = utf_new_char("java/lang/String");
 130         utf_java_lang_System           = utf_new_char("java/lang/System");
 131         utf_java_io_Serializable       = utf_new_char("java/io/Serializable");
 132
 133         utf_java_lang_Throwable        = utf_new_char("java/lang/Throwable");
 134         utf_java_lang_VMThrowable      = utf_new_char("java/lang/VMThrowable");
 135         utf_java_lang_Exception        = utf_new_char("java/lang/Exception");
 136         utf_java_lang_Error            = utf_new_char("java/lang/Error");
 137         utf_java_lang_OutOfMemoryError = utf_new_char("java/lang/OutOfMemoryError");
 138
 139         utf_java_lang_Void             = utf_new_char("java/lang/Void");
 140         utf_java_lang_Boolean          = utf_new_char("java/lang/Boolean");
 141         utf_java_lang_Byte             = utf_new_char("java/lang/Byte");
 142         utf_java_lang_Character        = utf_new_char("java/lang/Character");
 143         utf_java_lang_Short            = utf_new_char("java/lang/Short");
 144         utf_java_lang_Integer          = utf_new_char("java/lang/Integer");
 145         utf_java_lang_Long             = utf_new_char("java/lang/Long");
 146         utf_java_lang_Float            = utf_new_char("java/lang/Float");
 147         utf_java_lang_Double           = utf_new_char("java/lang/Double");
 148
 149         utf_java_util_Vector           = utf_new_char("java/util/Vector");
 150
 151         utf_InnerClasses               = utf_new_char("InnerClasses");
 152         utf_ConstantValue              = utf_new_char("ConstantValue");
 153         utf_Code                       = utf_new_char("Code");
 154         utf_Exceptions                 = utf_new_char("Exceptions");
 155         utf_LineNumberTable            = utf_new_char("LineNumberTable");
 156         utf_SourceFile                 = utf_new_char("SourceFile");
 157
 158         utf_init                           = utf_new_char("<init>");
 159         utf_clinit                         = utf_new_char("<clinit>");
 160         utf_finalize                   = utf_new_char("finalize");
 161
 162         utf_printStackTrace            = utf_new_char("printStackTrace");
 163         utf_fillInStackTrace           = utf_new_char("fillInStackTrace");
 164         utf_loadClass                  = utf_new_char("loadClass");
 165
 166         utf_void__void                 = utf_new_char("()V");
 167         utf_boolean__void              = utf_new_char("(Z)V");
 168         utf_byte__void                 = utf_new_char("(B)V");
 169         utf_char__void                 = utf_new_char("(C)V");
 170         utf_short__void                = utf_new_char("(S)V");
 171         utf_int__void                  = utf_new_char("(I)V");
 172         utf_long__void                 = utf_new_char("(J)V");
 173         utf_float__void                = utf_new_char("(F)V");
 174         utf_double__void               = utf_new_char("(D)V");
 175         utf_void__java_lang_Throwable  = utf_new_char("()Ljava/lang/Throwable;");
 176         utf_java_lang_String__void     = utf_new_char("(Ljava/lang/String;)V");
 177
 178         utf_java_lang_String__java_lang_Class =
 179                 utf_new_char("(Ljava/lang/String;)Ljava/lang/Class;");
 180
 181         utf_java_lang_Throwable__void  = utf_new_char("(Ljava/lang/Throwable;)V");
 182         array_packagename              = utf_new_char("<the array package>");
 183 }
 184
 185
 186 /* utf_hashkey *****************************************************************
 187
 188    The hashkey is computed from the utf-text by using up to 8
 189    characters.  For utf-symbols longer than 15 characters 3 characters
 190    are taken from the beginning and the end, 2 characters are taken
 191    from the middle.
 192
 193 *******************************************************************************/
 194
 195 #define nbs(val) ((u4) *(++text) << val) /* get next byte, left shift by val  */
 196 #define fbs(val) ((u4) *(  text) << val) /* get first byte, left shift by val */
 197
 198 u4 utf_hashkey(const char *text, u4 length)
 199 {
 200         const char *start_pos = text;       /* pointer to utf text                */
 201         u4 a;
 202
 203         switch (length) {
 204         case 0: /* empty string */
 205                 return 0;
 206
 207         case 1: return fbs(0);
 208         case 2: return fbs(0) ^ nbs(3);
 209         case 3: return fbs(0) ^ nbs(3) ^ nbs(5);
 210         case 4: return fbs(0) ^ nbs(2) ^ nbs(4) ^ nbs(6);
 211         case 5: return fbs(0) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(6);
 212         case 6: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(5) ^ nbs(6);
 213         case 7: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6);
 214         case 8: return fbs(0) ^ nbs(1) ^ nbs(2) ^ nbs(3) ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7);
 215
 216         case 9:
 217                 a = fbs(0);
 218                 a ^= nbs(1);
 219                 a ^= nbs(2);
 220                 text++;
 221                 return a ^ nbs(4) ^ nbs(5) ^ nbs(6) ^ nbs(7) ^ nbs(8);
 222
 223         case 10:
 224                 a = fbs(0);
 225                 text++;
 226                 a ^= nbs(2);
 227                 a ^= nbs(3);
 228                 a ^= nbs(4);
 229                 text++;
 230                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9);
 231
 232         case 11:
 233                 a = fbs(0);
 234                 text++;
 235                 a ^= nbs(2);
 236                 a ^= nbs(3);
 237                 a ^= nbs(4);
 238                 text++;
 239                 return a ^ nbs(6) ^ nbs(7) ^ nbs(8) ^ nbs(9) ^ nbs(10);
 240
 241         case 12:
 242                 a = fbs(0);
 243                 text += 2;
 244                 a ^= nbs(2);
 245                 a ^= nbs(3);
 246                 text++;
 247                 a ^= nbs(5);
 248                 a ^= nbs(6);
 249                 a ^= nbs(7);
 250                 text++;
 251                 return a ^ nbs(9) ^ nbs(10);
 252
 253         case 13:
 254                 a = fbs(0);
 255                 a ^= nbs(1);
 256                 text++;
 257                 a ^= nbs(3);
 258                 a ^= nbs(4);
 259                 text += 2;
 260                 a ^= nbs(7);
 261                 a ^= nbs(8);
 262                 text += 2;
 263                 return a ^ nbs(9) ^ nbs(10);
 264
 265         case 14:
 266                 a = fbs(0);
 267                 text += 2;
 268                 a ^= nbs(3);
 269                 a ^= nbs(4);
 270                 text += 2;
 271                 a ^= nbs(7);
 272                 a ^= nbs(8);
 273                 text += 2;
 274                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 275
 276         case 15:
 277                 a = fbs(0);
 278                 text += 2;
 279                 a ^= nbs(3);
 280                 a ^= nbs(4);
 281                 text += 2;
 282                 a ^= nbs(7);
 283                 a ^= nbs(8);
 284                 text += 2;
 285                 return a ^ nbs(9) ^ nbs(10) ^ nbs(11);
 286
 287         default:  /* 3 characters from beginning */
 288                 a = fbs(0);
 289                 text += 2;
 290                 a ^= nbs(3);
 291                 a ^= nbs(4);
 292
 293                 /* 2 characters from middle */
 294                 text = start_pos + (length / 2);
 295                 a ^= fbs(5);
 296                 text += 2;
 297                 a ^= nbs(6);
 298
 299                 /* 3 characters from end */
 300                 text = start_pos + length - 4;
 301
 302                 a ^= fbs(7);
 303                 text++;
 304
 305                 return a ^ nbs(10) ^ nbs(11);
 306     }
 307 }
 308
 309
 310 /* utf_hashkey *****************************************************************
 311
 312    Compute the hashkey of a unicode string.
 313
 314 *******************************************************************************/
 315
 316 u4 unicode_hashkey(u2 *text, u2 len)
 317 {
 318         return utf_hashkey((char *) text, len);
 319 }
 320
 321
 322 /* utf_new *********************************************************************
 323
 324    Creates a new utf-symbol, the text of the symbol is passed as a
 325    u1-array. The function searches the utf-hashtable for a utf-symbol
 326    with this text. On success the element returned, otherwise a new
 327    hashtable element is created.
 328
 329    If the number of entries in the hashtable exceeds twice the size of
 330    the hashtable slots a reorganization of the hashtable is done and
 331    the utf symbols are copied to a new hashtable with doubled size.
 332
 333 *******************************************************************************/
 334
 335 utf *utf_new_intern(const char *text, u2 length);
 336
 337 utf *utf_new(const char *text, u2 length)
 338 {
 339     utf *r;
 340
 341 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
 342     tables_lock();
 343 #endif
 344
 345     r = utf_new_intern(text, length);
 346
 347 #if defined(USE_THREADS) && defined(NATIVE_THREADS)
 348     tables_unlock();
 349 #endif
 350
 351     return r;
 352 }
 353
 354
 355 utf *utf_new_intern(const char *text, u2 length)
 356 {
 357         u4 key;                             /* hashkey computed from utf-text     */
 358         u4 slot;                            /* slot in hashtable                  */
 359         utf *u;                             /* hashtable element                  */
 360         u2 i;
 361
 362 #ifdef STATISTICS
 363         if (opt_stat)
 364                 count_utf_new++;
 365 #endif
 366
 367         key  = utf_hashkey(text, length);
 368         slot = key & (utf_hash.size - 1);
 369         u    = utf_hash.ptr[slot];
 370
 371         /* search external hash chain for utf-symbol */
 372         while (u) {
 373                 if (u->blength == length) {
 374
 375                         /* compare text of hashtable elements */
 376                         for (i = 0; i < length; i++)
 377                                 if (text[i] != u->text[i]) goto nomatch;
 378
 379 #ifdef STATISTICS
 380                         if (opt_stat)
 381                                 count_utf_new_found++;
 382 #endif
 383
 384                         /* symbol found in hashtable */
 385                         return u;
 386                 }
 387         nomatch:
 388                 u = u->hashlink; /* next element in external chain */
 389         }
 390
 391 #ifdef STATISTICS
 392         if (opt_stat)
 393                 count_utf_len += sizeof(utf) + length;
 394 #endif
 395
 396         /* location in hashtable found, create new utf element */
 397         u = NEW(utf);
 398         u->blength  = length;               /* length in bytes of utfstring       */
 399         u->hashlink = utf_hash.ptr[slot];   /* link in external hashchain         */
 400         u->text     = mem_alloc(length + 1);/* allocate memory for utf-text       */
 401         memcpy(u->text, text, length);      /* copy utf-text                      */
 402         u->text[length] = '\0';
 403         utf_hash.ptr[slot] = u;             /* insert symbol into table           */
 404
 405         utf_hash.entries++;                 /* update number of entries           */
 406
 407         if (utf_hash.entries > (utf_hash.size * 2)) {
 408
 409         /* reorganization of hashtable, average length of
 410            the external chains is approx. 2                */
 411
 412                 u4 i;
 413                 utf *u;
 414                 hashtable newhash; /* the new hashtable */
 415
 416                 /* create new hashtable, double the size */
 417                 init_hashtable(&newhash, utf_hash.size * 2);
 418                 newhash.entries = utf_hash.entries;
 419
 420 #ifdef STATISTICS
 421                 if (opt_stat)
 422                         count_utf_len += sizeof(utf*) * utf_hash.size;
 423 #endif
 424
 425                 /* transfer elements to new hashtable */
 426                 for (i = 0; i < utf_hash.size; i++) {
 427                         u = (utf *) utf_hash.ptr[i];
 428                         while (u) {
 429                                 utf *nextu = u->hashlink;
 430                                 u4 slot = utf_hashkey(u->text, u->blength) & (newhash.size - 1);
 431
 432                                 u->hashlink = (utf *) newhash.ptr[slot];
 433                                 newhash.ptr[slot] = u;
 434
 435                                 /* follow link in external hash chain */
 436                                 u = nextu;
 437                         }
 438                 }
 439
 440                 /* dispose old table */
 441                 MFREE(utf_hash.ptr, void*, utf_hash.size);
 442                 utf_hash = newhash;
 443         }
 444
 445         return u;
 446 }
 447
 448
 449 /* utf_new_u2 ******************************************************************
 450
 451    Make utf symbol from u2 array, if isclassname is true '.' is
 452    replaced by '/'.
 453
 454 *******************************************************************************/
 455
 456 utf *utf_new_u2(u2 *unicode_pos, u4 unicode_length, bool isclassname)
 457 {
 458         char *buffer;                   /* memory buffer for  unicode characters  */
 459         char *pos;                      /* pointer to current position in buffer  */
 460         u4 left;                        /* unicode characters left                */
 461         u4 buflength;                   /* utf length in bytes of the u2 array    */
 462         utf *result;                    /* resulting utf-string                   */
 463         int i;
 464
 465         /* determine utf length in bytes and allocate memory */
 466
 467         buflength = u2_utflength(unicode_pos, unicode_length);
 468         buffer    = MNEW(char, buflength);
 469
 470         left = buflength;
 471         pos  = buffer;
 472
 473         for (i = 0; i++ < unicode_length; unicode_pos++) {
 474                 /* next unicode character */
 475                 u2 c = *unicode_pos;
 476
 477                 if ((c != 0) && (c < 0x80)) {
 478                         /* 1 character */
 479                         left--;
 480                 if ((int) left < 0) break;
 481                         /* convert classname */
 482                         if (isclassname && c == '.')
 483                                 *pos++ = '/';
 484                         else
 485                                 *pos++ = (char) c;
 486
 487                 } else if (c < 0x800) {
 488                         /* 2 characters */
 489                 unsigned char high = c >> 6;
 490                 unsigned char low  = c & 0x3F;
 491                         left = left - 2;
 492                 if ((int) left < 0) break;
 493                 *pos++ = high | 0xC0;
 494                 *pos++ = low  | 0x80;
 495
 496                 } else {
 497                 /* 3 characters */
 498                 char low  = c & 0x3f;
 499                 char mid  = (c >> 6) & 0x3F;
 500                 char high = c >> 12;
 501                         left = left - 3;
 502                 if ((int) left < 0) break;
 503                 *pos++ = high | 0xE0;
 504                 *pos++ = mid  | 0x80;
 505                 *pos++ = low  | 0x80;
 506                 }
 507         }
 508
 509         /* insert utf-string into symbol-table */
 510         result = utf_new(buffer,buflength);
 511
 512         MFREE(buffer, char, buflength);
 513
 514         return result;
 515 }
 516
 517
 518 /* utf_new_char ****************************************************************
 519
 520    Creates a new utf symbol, the text for this symbol is passed as a
 521    c-string ( = char* ).
 522
 523 *******************************************************************************/
 524
 525 utf *utf_new_char(const char *text)
 526 {
 527         return utf_new(text, strlen(text));
 528 }
 529
 530
 531 /* utf_new_char_classname ******************************************************
 532
 533    Creates a new utf symbol, the text for this symbol is passed as a
 534    c-string ( = char* ) "." characters are going to be replaced by
 535    "/". Since the above function is used often, this is a separte
 536    function, instead of an if.
 537
 538 *******************************************************************************/
 539
 540 utf *utf_new_char_classname(const char *text)
 541 {
 542         if (strchr(text, '.')) {
 543                 char *txt = strdup(text);
 544                 char *end = txt + strlen(txt);
 545                 char *c;
 546                 utf *tmpRes;
 547
 548                 for (c = txt; c < end; c++)
 549                         if (*c == '.') *c = '/';
 550
 551                 tmpRes = utf_new(txt, strlen(txt));
 552                 FREE(txt, 0);
 553
 554                 return tmpRes;
 555
 556         } else
 557                 return utf_new(text, strlen(text));
 558 }
 559
 560
 561 /* utf_nextu2 ******************************************************************
 562
 563    Read the next unicode character from the utf string and increment
 564    the utf-string pointer accordingly.
 565
 566 *******************************************************************************/
 567
 568 u2 utf_nextu2(char **utf_ptr)
 569 {
 570     /* uncompressed unicode character */
 571     u2 unicode_char = 0;
 572     /* current position in utf text */
 573     unsigned char *utf = (unsigned char *) (*utf_ptr);
 574     /* bytes representing the unicode character */
 575     unsigned char ch1, ch2, ch3;
 576     /* number of bytes used to represent the unicode character */
 577     int len = 0;
 578
 579     switch ((ch1 = utf[0]) >> 4) {
 580         default: /* 1 byte */
 581                 (*utf_ptr)++;
 582                 return (u2) ch1;
 583         case 0xC:
 584         case 0xD: /* 2 bytes */
 585                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 586                         unsigned char high = ch1 & 0x1F;
 587                         unsigned char low  = ch2 & 0x3F;
 588                         unicode_char = (high << 6) + low;
 589                         len = 2;
 590                 }
 591                 break;
 592
 593         case 0xE: /* 2 or 3 bytes */
 594                 if (((ch2 = utf[1]) & 0xC0) == 0x80) {
 595                         if (((ch3 = utf[2]) & 0xC0) == 0x80) {
 596                                 unsigned char low  = ch3 & 0x3f;
 597                                 unsigned char mid  = ch2 & 0x3f;
 598                                 unsigned char high = ch1 & 0x0f;
 599                                 unicode_char = (((high << 6) + mid) << 6) + low;
 600                                 len = 3;
 601                         } else
 602                                 len = 2;
 603                 }
 604                 break;
 605     }
 606
 607     /* update position in utf-text */
 608     *utf_ptr = (char *) (utf + len);
 609
 610     return unicode_char;
 611 }
 612
 613
 614 /* utf_strlen ******************************************************************
 615
 616    Determine number of unicode characters in the utf string.
 617
 618 *******************************************************************************/
 619
 620 u4 utf_strlen(utf *u)
 621 {
 622         char *endpos;                       /* points behind utf string           */
 623         char *utf_ptr;                      /* current position in utf text       */
 624         u4 len = 0;                         /* number of unicode characters       */
 625
 626         if (!u) {
 627                 *exceptionptr = new_nullpointerexception();
 628                 return 0;
 629         }
 630
 631         endpos = utf_end(u);
 632         utf_ptr = u->text;
 633
 634         while (utf_ptr < endpos) {
 635                 len++;
 636                 /* next unicode character */
 637                 utf_nextu2(&utf_ptr);
 638         }
 639
 640         if (utf_ptr != endpos)
 641                 /* string ended abruptly */
 642                 throw_cacao_exception_exit(string_java_lang_InternalError,
 643                                                                    "Illegal utf8 string");
 644
 645         return len;
 646 }
 647
 648
 649 /* u2_utflength ****************************************************************
 650
 651    Returns the utf length in bytes of a u2 array.
 652
 653 *******************************************************************************/
 654
 655 u4 u2_utflength(u2 *text, u4 u2_length)
 656 {
 657         u4 result_len = 0;                  /* utf length in bytes                */
 658         u2 ch;                              /* current unicode character          */
 659         u4 len;
 660
 661         for (len = 0; len < u2_length; len++) {
 662                 /* next unicode character */
 663                 ch = *text++;
 664
 665                 /* determine bytes required to store unicode character as utf */
 666                 if (ch && (ch < 0x80))
 667                         result_len++;
 668                 else if (ch < 0x800)
 669                         result_len += 2;
 670                 else
 671                         result_len += 3;
 672         }
 673
 674     return result_len;
 675 }
 676
 677
 678 /* utf_display *****************************************************************
 679
 680    Write utf symbol to stdout (for debugging purposes).
 681
 682 *******************************************************************************/
 683
 684 void utf_display(utf *u)
 685 {
 686         char *endpos;                       /* points behind utf string           */
 687         char *utf_ptr;                      /* current position in utf text       */
 688
 689         if (!u) {
 690                 printf("NULL");
 691                 fflush(stdout);
 692                 return;
 693         }
 694
 695         endpos = utf_end(u);
 696         utf_ptr = u->text;
 697
 698         while (utf_ptr < endpos) {
 699                 /* read next unicode character */
 700                 u2 c = utf_nextu2(&utf_ptr);
 701                 if (c >= 32 && c <= 127) printf("%c", c);
 702                 else printf("?");
 703         }
 704
 705         fflush(stdout);
 706 }
 707
 708
 709 /* utf_display_classname *******************************************************
 710
 711    Write utf symbol to stdout with `/' converted to `.' (for debugging
 712    purposes).
 713
 714 *******************************************************************************/
 715
 716 void utf_display_classname(utf *u)
 717 {
 718         char *endpos;                       /* points behind utf string           */
 719         char *utf_ptr;                      /* current position in utf text       */
 720
 721         if (!u) {
 722                 printf("NULL");
 723                 fflush(stdout);
 724                 return;
 725         }
 726
 727         endpos = utf_end(u);
 728         utf_ptr = u->text;
 729
 730         while (utf_ptr < endpos) {
 731                 /* read next unicode character */
 732                 u2 c = utf_nextu2(&utf_ptr);
 733                 if (c == '/') c = '.';
 734                 if (c >= 32 && c <= 127) printf("%c", c);
 735                 else printf("?");
 736         }
 737
 738         fflush(stdout);
 739 }
 740
 741
 742 /* utf_sprint ******************************************************************
 743
 744    Write utf symbol into c-string (for debugging purposes).
 745
 746 *******************************************************************************/
 747
 748 void utf_sprint(char *buffer, utf *u)
 749 {
 750         char *endpos;                       /* points behind utf string           */
 751         char *utf_ptr;                      /* current position in utf text       */
 752         u2 pos = 0;                         /* position in c-string               */
 753
 754         if (!u) {
 755                 memcpy(buffer, "NULL", 5);      /* 4 chars + terminating \0           */
 756                 return;
 757         }
 758
 759         endpos = utf_end(u);
 760         utf_ptr = u->text;
 761
 762         while (utf_ptr < endpos)
 763                 /* copy next unicode character */
 764                 buffer[pos++] = utf_nextu2(&utf_ptr);
 765
 766         /* terminate string */
 767         buffer[pos] = '\0';
 768 }
 769
 770
 771 /* utf_sprint_classname ********************************************************
 772
 773    Write utf symbol into c-string with `/' converted to `.' (for debugging
 774    purposes).
 775
 776 *******************************************************************************/
 777
 778 void utf_sprint_classname(char *buffer, utf *u)
 779 {
 780         char *endpos;                       /* points behind utf string           */
 781         char *utf_ptr;                      /* current position in utf text       */
 782         u2 pos = 0;                         /* position in c-string               */
 783
 784         if (!u) {
 785                 memcpy(buffer, "NULL", 5);      /* 4 chars + terminating \0           */
 786                 return;
 787         }
 788
 789         endpos = utf_end(u);
 790         utf_ptr = u->text;
 791
 792         while (utf_ptr < endpos) {
 793                 /* copy next unicode character */
 794                 u2 c = utf_nextu2(&utf_ptr);
 795                 if (c == '/') c = '.';
 796                 buffer[pos++] = c;
 797         }
 798
 799         /* terminate string */
 800         buffer[pos] = '\0';
 801 }
 802
 803
 804 /* utf_fprint ******************************************************************
 805
 806    Write utf symbol into file.
 807
 808 *******************************************************************************/
 809
 810 void utf_fprint(FILE *file, utf *u)
 811 {
 812         char *endpos;                       /* points behind utf string           */
 813         char *utf_ptr;                      /* current position in utf text       */
 814
 815         if (!u)
 816                 return;
 817
 818         endpos = utf_end(u);
 819         utf_ptr = u->text;
 820
 821         while (utf_ptr < endpos) {
 822                 /* read next unicode character */
 823                 u2 c = utf_nextu2(&utf_ptr);
 824
 825                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
 826                 else fprintf(file, "?");
 827         }
 828 }
 829
 830
 831 /* utf_fprint_classname ********************************************************
 832
 833    Write utf symbol into file with `/' converted to `.'.
 834
 835 *******************************************************************************/
 836
 837 void utf_fprint_classname(FILE *file, utf *u)
 838 {
 839         char *endpos;                       /* points behind utf string           */
 840         char *utf_ptr;                      /* current position in utf text       */
 841
 842     if (!u)
 843                 return;
 844
 845         endpos = utf_end(u);
 846         utf_ptr = u->text;
 847
 848         while (utf_ptr < endpos) {
 849                 /* read next unicode character */
 850                 u2 c = utf_nextu2(&utf_ptr);
 851                 if (c == '/') c = '.';
 852
 853                 if (c >= 32 && c <= 127) fprintf(file, "%c", c);
 854                 else fprintf(file, "?");
 855         }
 856 }
 857
 858
 859 /* is_valid_utf ****************************************************************
 860
 861    Return true if the given string is a valid UTF-8 string.
 862
 863    utf_ptr...points to first character
 864    end_pos...points after last character
 865
 866 *******************************************************************************/
 867
 868 static unsigned long min_codepoint[6] = {0,1L<<7,1L<<11,1L<<16,1L<<21,1L<<26};
 869
 870 bool is_valid_utf(char *utf_ptr, char *end_pos)
 871 {
 872         int bytes;
 873         int len,i;
 874         char c;
 875         unsigned long v;
 876
 877         if (end_pos < utf_ptr) return false;
 878         bytes = end_pos - utf_ptr;
 879         while (bytes--) {
 880                 c = *utf_ptr++;
 881
 882                 if (!c) return false;                     /* 0x00 is not allowed */
 883                 if ((c & 0x80) == 0) continue;            /* ASCII */
 884
 885                 if      ((c & 0xe0) == 0xc0) len = 1;     /* 110x xxxx */
 886                 else if ((c & 0xf0) == 0xe0) len = 2;     /* 1110 xxxx */
 887                 else if ((c & 0xf8) == 0xf0) len = 3;     /* 1111 0xxx */
 888                 else if ((c & 0xfc) == 0xf8) len = 4;     /* 1111 10xx */
 889                 else if ((c & 0xfe) == 0xfc) len = 5;     /* 1111 110x */
 890                 else return false;                        /* invalid leading byte */
 891
 892                 if (len > 2) return false;                /* Java limitation */
 893
 894                 v = (unsigned long)c & (0x3f >> len);
 895
 896                 if ((bytes -= len) < 0) return false;     /* missing bytes */
 897
 898                 for (i = len; i--; ) {
 899                         c = *utf_ptr++;
 900                         if ((c & 0xc0) != 0x80)               /* 10xx xxxx */
 901                                 return false;
 902                         v = (v << 6) | (c & 0x3f);
 903                 }
 904
 905                 if (v == 0) {
 906                         if (len != 1) return false;           /* Java special */
 907
 908                 } else {
 909                         /* Sun Java seems to allow overlong UTF-8 encodings */
 910
 911                         if (v < min_codepoint[len]) { /* overlong UTF-8 */
 912                                 if (!opt_liberalutf)
 913                                         fprintf(stderr,"WARNING: Overlong UTF-8 sequence found.\n");
 914                                 /* XXX change this to panic? */
 915                         }
 916                 }
 917
 918                 /* surrogates in UTF-8 seem to be allowed in Java classfiles */
 919                 /* if (v >= 0xd800 && v <= 0xdfff) return false; */ /* surrogates */
 920
 921                 /* even these seem to be allowed */
 922                 /* if (v == 0xfffe || v == 0xffff) return false; */ /* invalid codepoints */
 923         }
 924
 925         return true;
 926 }
 927
 928
 929 /* is_valid_name ***************************************************************
 930
 931    Return true if the given string may be used as a class/field/method
 932    name. (Currently this only disallows empty strings and control
 933    characters.)
 934
 935    NOTE: The string is assumed to have passed is_valid_utf!
 936
 937    utf_ptr...points to first character
 938    end_pos...points after last character
 939
 940 *******************************************************************************/
 941
 942 bool is_valid_name(char *utf_ptr, char *end_pos)
 943 {
 944         if (end_pos <= utf_ptr) return false; /* disallow empty names */
 945
 946         while (utf_ptr < end_pos) {
 947                 unsigned char c = *utf_ptr++;
 948
 949                 if (c < 0x20) return false; /* disallow control characters */
 950                 if (c == 0xc0 && (unsigned char) *utf_ptr == 0x80)  /* disallow zero */
 951                         return false;
 952         }
 953
 954         return true;
 955 }
 956
 957 bool is_valid_name_utf(utf *u)
 958 {
 959         return is_valid_name(u->text,utf_end(u));
 960 }
 961
 962
 963 /* utf_show ********************************************************************
 964
 965    Writes the utf symbols in the utfhash to stdout and displays the
 966    number of external hash chains grouped according to the chainlength
 967    (for debugging purposes).
 968
 969 *******************************************************************************/
 970
 971 void utf_show(void)
 972 {
 973
 974 #define CHAIN_LIMIT 20               /* limit for seperated enumeration */
 975
 976         u4 chain_count[CHAIN_LIMIT]; /* numbers of chains */
 977         u4 max_chainlength = 0;      /* maximum length of the chains */
 978         u4 sum_chainlength = 0;      /* sum of the chainlengths */
 979         u4 beyond_limit = 0;         /* number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
 980         u4 i;
 981
 982         printf ("UTF-HASH:\n");
 983
 984         /* show element of utf-hashtable */
 985         for (i=0; i<utf_hash.size; i++) {
 986                 utf *u = utf_hash.ptr[i];
 987                 if (u) {
 988                         printf ("SLOT %d: ", (int) i);
 989                         while (u) {
 990                                 printf ("'");
 991                                 utf_display (u);
 992                                 printf ("' ");
 993                                 u = u->hashlink;
 994                         }
 995                         printf ("\n");
 996                 }
 997
 998         }
 999
1000         printf ("UTF-HASH: %d slots for %d entries\n",
1001                         (int) utf_hash.size, (int) utf_hash.entries );
1002
1003
1004         if (utf_hash.entries == 0)
1005                 return;
1006
1007         printf("chains:\n  chainlength    number of chains    %% of utfstrings\n");
1008
1009         for (i=0;i<CHAIN_LIMIT;i++)
1010                 chain_count[i]=0;
1011
1012         /* count numbers of hashchains according to their length */
1013         for (i=0; i<utf_hash.size; i++) {
1014
1015                 utf *u = (utf*) utf_hash.ptr[i];
1016                 u4 chain_length = 0;
1017
1018                 /* determine chainlength */
1019                 while (u) {
1020                         u = u->hashlink;
1021                         chain_length++;
1022                 }
1023
1024                 /* update sum of all chainlengths */
1025                 sum_chainlength+=chain_length;
1026
1027                 /* determine the maximum length of the chains */
1028                 if (chain_length>max_chainlength)
1029                         max_chainlength = chain_length;
1030
1031                 /* update number of utf-symbols in chains with length>=CHAIN_LIMIT-1 */
1032                 if (chain_length>=CHAIN_LIMIT) {
1033                         beyond_limit+=chain_length;
1034                         chain_length=CHAIN_LIMIT-1;
1035                 }
1036
1037                 /* update number of hashchains of current length */
1038                 chain_count[chain_length]++;
1039         }
1040
1041         /* display results */
1042         for (i=1;i<CHAIN_LIMIT-1;i++)
1043                 printf("       %2d %17d %18.2f%%\n",i,chain_count[i],(((float) chain_count[i]*i*100)/utf_hash.entries));
1044
1045         printf("     >=%2d %17d %18.2f%%\n",CHAIN_LIMIT-1,chain_count[CHAIN_LIMIT-1],((float) beyond_limit*100)/utf_hash.entries);
1046
1047
1048         printf("max. chainlength:%5d\n",max_chainlength);
1049
1050         /* avg. chainlength = sum of chainlengths / number of chains */
1051         printf("avg. chainlength:%5.2f\n",(float) sum_chainlength / (utf_hash.size-chain_count[0]));
1052 }
1053
1054
1055 /*
1056  * These are local overrides for various environment variables in Emacs.
1057  * Please do not remove this and leave it at the end of the file, where
1058  * Emacs will automagically detect them.
1059  * ---------------------------------------------------------------------
1060  * Local variables:
1061  * mode: c
1062  * indent-tabs-mode: t
1063  * c-basic-offset: 4
1064  * tab-width: 4
1065  * End:
1066  */