* src/vm/utf8.h, src/vm/utf8.c (utf8_safe_number_of_u2s): New function.

author edwin <none@none>

Tue, 24 Oct 2006 16:41:54 +0000 (16:41 +0000)

committer edwin <none@none>

Tue, 24 Oct 2006 16:41:54 +0000 (16:41 +0000)
author edwin <none@none>
Tue, 24 Oct 2006 16:41:54 +0000 (16:41 +0000)
committer edwin <none@none>
Tue, 24 Oct 2006 16:41:54 +0000 (16:41 +0000)
diff --git a/src/native/jni.c b/src/native/jni.c

index e74dfb7c953ef6fa11446f75b0a9d5896a5e0563..8f5be412f1bdadaf674c836b0058d0d1e913fda7 100644 (file)
--- a/src/native/jni.c
+++ b/src/native/jni.c
@@ -32,7 +32,7 @@
              Christian Thalinger
                         Edwin Steiner
  
-   $Id: jni.c 5779 2006-10-14 19:12:58Z twisti $
+   $Id: jni.c 5821 2006-10-24 16:41:54Z edwin $
  
  */
  
@@ -4084,7 +4084,7 @@ jstring _Jv_JNI_NewStringUTF(JNIEnv *env, const char *bytes)
  
         STATISTICS(jniinvokation());
  
-       s = javastring_new(utf_new_char(bytes));
+       s = javastring_safe_new_from_utf8(bytes);
  
      return (jstring) _Jv_JNI_NewLocalRef(env, (jobject) s);
  }
diff --git a/src/vm/string.c b/src/vm/string.c

index b6b075175ed7987686ea870dbe702c09edf43712..d11a7d340d0113ec72b5e2b50598a822ca181ff1 100644 (file)
--- a/src/vm/string.c
+++ b/src/vm/string.c
@@ -31,7 +31,7 @@
     Changes: Christian Thalinger
                         Edwin Steiner
  
-   $Id: string.c 5123 2006-07-12 21:45:34Z twisti $
+   $Id: string.c 5821 2006-10-24 16:41:54Z edwin $
  
  */
  
@@ -322,6 +322,57 @@ java_lang_String *javastring_new_from_utf_buffer(const char *buffer, u4 blength)
  }
  
  
+/* javastring_safe_new_from_utf8 ***********************************************
+
+   Create a new object of type java/lang/String with the text from
+   the specified UTF-8 string. This function is safe for invalid UTF-8.
+   (Invalid characters will be replaced by U+fffd.)
+
+   IN:
+      text.........the UTF-8 string, zero-terminated.
+
+   RETURN VALUE:
+      the java.lang.String object, or
+      NULL if an exception has been thrown
+
+*******************************************************************************/
+
+java_lang_String *javastring_safe_new_from_utf8(const char *text)
+{
+       java_lang_String *s;            /* result-string                          */
+       java_chararray *a;              /* char array that backs the result       */
+       s4 len;                         /* number of u2s the text decodes to      */
+
+       assert(text);
+
+       /* calculate number of Java characters (u2s); invalid input is counted as U+fffd replacements */
+
+       len = utf8_safe_number_of_u2s(text);
+
+       /* allocate the String object and the char array */
+
+       s = (java_lang_String *) builtin_new(class_java_lang_String);
+       a = builtin_newarray_char(len);
+
+       /* javastring or character-array could not be created? (checked before either one is used) */
+
+       if (!a || !s)
+               return NULL;
+
+       /* decode the UTF-8 string into the array; the converter mirrors the counting function and so writes len u2s */
+
+       utf8_safe_convert_to_u2s(text, a->data);
+
+       /* set fields of the String object: the string spans the whole array */
+
+       s->value  = a;
+       s->offset = 0;
+       s->count  = len;
+
+       return s;
+}
+
+
  /* javastring_new_from_utf_string **********************************************
  
     Create a new object of type java/lang/String with the text from
diff --git a/src/vm/stringlocal.h b/src/vm/stringlocal.h

index ce9b5e1c4bd17e67c7f935145117a4a8f8e0f0e9..99fee6bda7c13462cadc6fda6cc6d158098ffd00 100644 (file)
--- a/src/vm/stringlocal.h
+++ b/src/vm/stringlocal.h
@@ -28,7 +28,7 @@
  
     Changes: Edwin Steiner
  
-   $Id: stringlocal.h 4876 2006-05-05 15:26:04Z edwin $
+   $Id: stringlocal.h 5821 2006-10-24 16:41:54Z edwin $
  
  */
  
@@ -134,6 +134,9 @@ java_lang_String *javastring_new_from_ascii(const char *text);
  java_lang_String *javastring_new_from_utf_buffer(const char *buffer, u4 blength);
  java_lang_String *javastring_new_from_utf_string(const char *utfstr);
  
+/* creates a new object of type java/lang/String from (possibly invalid) UTF-8 */
+java_lang_String *javastring_safe_new_from_utf8(const char *text);
+
  /* make c-string from a javastring (debugging) */
  char *javastring_tochar(java_objectheader *s);
  
diff --git a/src/vm/utf8.c b/src/vm/utf8.c

index 62df9dc646754bd29240fdf19e533e043efc4556..7c786199756916b833ffe97bcf341e9b316b9808 100644 (file)
--- a/src/vm/utf8.c
+++ b/src/vm/utf8.c
@@ -31,7 +31,7 @@
              Christian Thalinger
                         Edwin Steiner
  
-   $Id: utf8.c 5697 2006-10-05 17:23:48Z twisti $
+   $Id: utf8.c 5821 2006-10-24 16:41:54Z edwin $
  
  */
  
@@ -737,6 +737,9 @@ utf *utf_new_char_classname(const char *text)
     Read the next unicode character from the utf string and increment
     the utf-string pointer accordingly.
  
+   CAUTION: This function is unsafe for input that was not checked 
+            by is_valid_utf!
+
  *******************************************************************************/
  
  u2 utf_nextu2(char **utf_ptr)
@@ -807,6 +810,9 @@ u4 utf_bytes(utf *u)
  
     Determine number of UTF-16 u2s in the given UTF-8 buffer
  
+   CAUTION: This function is unsafe for input that was not checked 
+            by is_valid_utf!
+
     CAUTION: Use this function *only* when you want to convert an UTF-8 buffer
     to an array of u2s (UTF-16) and want to know how many of them you will get.
     All other uses of this function are probably wrong.
@@ -849,6 +855,9 @@ u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength)
  
     Determine number of UTF-16 u2s in the utf string.
  
+   CAUTION: This function is unsafe for input that was not checked 
+            by is_valid_utf!
+
     CAUTION: Use this function *only* when you want to convert a utf string
     to an array of u2s and want to know how many of them you will get.
     All other uses of this function are probably wrong.
@@ -894,6 +903,262 @@ u4 utf_get_number_of_u2s(utf *u)
  }
  
  
+/* utf8_safe_number_of_u2s *****************************************************
+
+   Determine number of UTF-16 u2s needed for decoding the given UTF-8 string.
+   (For invalid UTF-8 the U+fffd replacement character will be counted.)
+
+   This function is safe even for invalid UTF-8 strings.
+
+   IN:
+      text..........zero-terminated UTF-8 string (may be invalid)
+                       must NOT be NULL
+
+   OUT:
+      the number of u2s needed to hold this string in UTF-16 encoding.
+         There is _no_ terminating zero included in this count.
+
+*******************************************************************************/
+
+s4 utf8_safe_number_of_u2s(const char *text) {
+       register const unsigned char *t;    /* read cursor into text */
+       register s4 byte;                   /* first byte of the current sequence */
+       register s4 len;                    /* u2s counted so far */
+       s4 byte1;                           /* continuation bytes of a 4-byte sequence */
+       s4 byte2;
+       s4 byte3;
+       s4 value;                           /* code point decoded from a 4-byte sequence */
+       s4 skip;                            /* bytes still to swallow after a 5-/6-byte lead byte */
+
+       assert(text);
+
+       len = 0;
+       t = (const unsigned char *) text;
+
+       /* CAUTION: Keep this code in sync with utf8_safe_convert_to_u2s! */
+
+       while (1) {
+               byte = *t++;
+
+               if (byte & 0x80) {
+                       /* highest bit set, non-ASCII character */
+
+                       if ((byte & 0xe0) == 0xc0) {
+                               /* 2-byte: should be 110..... 10...... ? */
+
+                               if ((*t++ & 0xc0) == 0x80)
+                                       ; /* valid 2-byte */
+                               else
+                                       t--; /* invalid: un-read the byte, it is re-examined as the start of the next sequence (it may be the NUL) */
+                       }
+                       else if ((byte & 0xf0) == 0xe0) {
+                               /* 3-byte: should be 1110.... 10...... 10...... */
+
+                               if ((*t++ & 0xc0) == 0x80) {
+                                       if ((*t++ & 0xc0) == 0x80)
+                                               ; /* valid 3-byte */
+                                       else
+                                               t--; /* invalid: un-read only the offending byte; the first continuation byte stays consumed */
+                               }
+                               else
+                                       t--; /* invalid: un-read the offending byte */
+                       }
+                       else if ((byte & 0xf8) == 0xf0) {
+                               /* 4-byte: should be 11110... 10...... 10...... 10...... */
+
+                               if (((byte1 = *t++) & 0xc0) == 0x80) {
+                                       if (((byte2 = *t++) & 0xc0) == 0x80) {
+                                               if (((byte3 = *t++) & 0xc0) == 0x80) {
+                                                       /* well-formed 4-byte sequence; decode it to see how many u2s it needs */
+                                                       value = ((byte  & 0x07) << 18)
+                                                                 | ((byte1 & 0x3f) << 12)
+                                                                 | ((byte2 & 0x3f) <<  6)
+                                                                 | ((byte3 & 0x3f)      );
+
+                                                       if (value > 0x10FFFF)
+                                                               ; /* invalid: above the last code point, counts as one replacement u2 */
+                                                       else if (value > 0xFFFF)
+                                                               len += 1; /* we need surrogates: one u2 here plus the len++ at the loop end */
+                                                       else
+                                                               ; /* 16 bits suffice */
+                                               }
+                                               else
+                                                       t--; /* invalid: un-read the offending byte */
+                                       }
+                                       else
+                                               t--; /* invalid: un-read the offending byte */
+                               }
+                               else
+                                       t--; /* invalid: un-read the offending byte */
+                       }
+                       else if ((byte & 0xfc) == 0xf8) {
+                               /* invalid 5-byte: swallow up to 4 following bytes that have the high bit set (this never passes the NUL) */
+                               skip = 4;
+                               for (; skip && (*t & 0x80); --skip)
+                                       t++;
+                       }
+                       else if ((byte & 0xfe) == 0xfc) {
+                               /* invalid 6-byte: swallow up to 5 following bytes that have the high bit set (this never passes the NUL) */
+                               skip = 5;
+                               for (; skip && (*t & 0x80); --skip)
+                                       t++;
+                       }
+                       else
+                               ; /* invalid: stray continuation byte (10......) or 0xfe/0xff */
+               }
+               else {
+                       /* NUL */
+
+                       if (byte == 0)
+                               break;
+
+                       /* ASCII character, common case */
+               }
+
+               len++; /* every sequence handled above, valid or invalid, accounts for one u2 here */
+       }
+
+       return len;
+}
+
+
+/* utf8_safe_convert_to_u2s ****************************************************
+
+   Convert the given UTF-8 string to UTF-16 into a pre-allocated buffer.
+   (Invalid UTF-8 will be replaced with the U+fffd replacement character.)
+   Use utf8_safe_number_of_u2s to determine the number of u2s to allocate.
+
+   This function is safe even for invalid UTF-8 strings.
+
+   IN:
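+      text..........zero-terminated UTF-8 string (may be invalid)
+                       must NOT be NULL
+      buffer........receives the u2s (no terminating zero is written); must have room for utf8_safe_number_of_u2s(text) entries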
+*******************************************************************************/
+
+#define UNICODE_REPLACEMENT  0xfffd /* u2 stored in place of input that cannot be decoded */
+
+void utf8_safe_convert_to_u2s(const char *text, u2 *buffer) {
+       register const unsigned char *t;    /* read cursor into text */
+       register s4 byte;                   /* first byte of the current sequence */
+       s4 byte1;                           /* continuation bytes of the current sequence */
+       s4 byte2;
+       s4 byte3;
+       s4 value;                           /* code point decoded from a 4-byte sequence */
+       s4 skip;                            /* bytes still to swallow after a 5-/6-byte lead byte */
+
+       assert(text);
+
+       t = (const unsigned char *) text;
+
+       /* CAUTION: Keep this code in sync with utf8_safe_number_of_u2s! */
+
+       while (1) {
+               byte = *t++;
+
+               if (byte & 0x80) {
+                       /* highest bit set, non-ASCII character */
+
+                       if ((byte & 0xe0) == 0xc0) {
+                               /* 2-byte: should be 110..... 10...... */
+
+                               if (((byte1 = *t++) & 0xc0) == 0x80) {
+                                       /* valid 2-byte UTF-8 (only the bit patterns are checked, overlong forms are decoded too) */
+                                       *buffer++ = ((byte  & 0x1f) << 6)
+                                                         | ((byte1 & 0x3f)     );
+                               }
+                               else {
+                                       *buffer++ = UNICODE_REPLACEMENT;
+                                       t--; /* un-read the byte, it is re-examined as the start of the next sequence (it may be the NUL) */
+                               }
+                       }
+                       else if ((byte & 0xf0) == 0xe0) {
+                               /* 3-byte: should be 1110.... 10...... 10...... */
+
+                               if (((byte1 = *t++) & 0xc0) == 0x80) {
+                                       if (((byte2 = *t++) & 0xc0) == 0x80) {
+                                               /* valid 3-byte UTF-8 */
+                                               *buffer++ = ((byte  & 0x0f) << 12)
+                                                                 | ((byte1 & 0x3f) <<  6)
+                                                                 | ((byte2 & 0x3f)      );
+                                       }
+                                       else {
+                                               *buffer++ = UNICODE_REPLACEMENT;
+                                               t--; /* un-read only the offending byte; byte1 stays consumed */
+                                       }
+                               }
+                               else {
+                                       *buffer++ = UNICODE_REPLACEMENT;
+                                       t--; /* un-read the offending byte */
+                               }
+                       }
+                       else if ((byte & 0xf8) == 0xf0) {
+                               /* 4-byte: should be 11110... 10...... 10...... 10...... */
+
+                               if (((byte1 = *t++) & 0xc0) == 0x80) {
+                                       if (((byte2 = *t++) & 0xc0) == 0x80) {
+                                               if (((byte3 = *t++) & 0xc0) == 0x80) {
+                                                       /* well-formed 4-byte sequence; the decoded value decides what is stored */
+                                                       value = ((byte  & 0x07) << 18)
+                                                                 | ((byte1 & 0x3f) << 12)
+                                                                 | ((byte2 & 0x3f) <<  6)
+                                                                 | ((byte3 & 0x3f)      );
+
+                                                       if (value > 0x10FFFF) {
+                                                               *buffer++ = UNICODE_REPLACEMENT;
+                                                       }
+                                                       else if (value > 0xFFFF) {
+                                                               /* we need surrogates: (value >> 10) - 0x40 is the top 10 bits of value - 0x10000 */
+                                                               *buffer++ = 0xd800 | ((value >> 10) - 0x40);
+                                                               *buffer++ = 0xdc00 | (value & 0x03ff);
+                                                       }
+                                                       else
+                                                               *buffer++ = value; /* 16 bits suffice, the value is stored unchanged */
+                                               }
+                                               else {
+                                                       *buffer++ = UNICODE_REPLACEMENT;
+                                                       t--; /* un-read the offending byte */
+                                               }
+                                       }
+                                       else {
+                                               *buffer++ = UNICODE_REPLACEMENT;
+                                               t--; /* un-read the offending byte */
+                                       }
+                               }
+                               else {
+                                       *buffer++ = UNICODE_REPLACEMENT;
+                                       t--; /* un-read the offending byte */
+                               }
+                       }
+                       else if ((byte & 0xfc) == 0xf8) {
+                               skip = 4; /* invalid 5-byte: swallow up to 4 following high-bit bytes, store one replacement */
+                               for (; skip && (*t & 0x80); --skip)
+                                       t++;
+                               *buffer++ = UNICODE_REPLACEMENT;
+                       }
+                       else if ((byte & 0xfe) == 0xfc) {
+                               skip = 5; /* invalid 6-byte: swallow up to 5 following high-bit bytes, store one replacement */
+                               for (; skip && (*t & 0x80); --skip)
+                                       t++;
+                               *buffer++ = UNICODE_REPLACEMENT;
+                       }
+                       else
+                               *buffer++ = UNICODE_REPLACEMENT; /* stray continuation byte (10......) or 0xfe/0xff */
+               }
+               else {
+                       /* NUL: end of input, nothing is written for it */
+
+                       if (byte == 0)
+                               break;
+
+                       /* ASCII character, common case */
+
+                       *buffer++ = byte;
+               }
+       }
+}
+
+
  /* u2_utflength ****************************************************************
  
     Returns the utf length in bytes of a u2 array.
diff --git a/src/vm/utf8.h b/src/vm/utf8.h

index ab937ba3feff3846e5dbbaf18803b548348742b7..e5c462b65382d0b478e2a6629f96743aaa7a2515 100644 (file)
--- a/src/vm/utf8.h
+++ b/src/vm/utf8.h
@@ -28,7 +28,7 @@
  
     Changes: Edwin Steiner
  
-   $Id: utf8.h 5697 2006-10-05 17:23:48Z twisti $
+   $Id: utf8.h 5821 2006-10-24 16:41:54Z edwin $
  
  */
  
@@ -192,7 +192,11 @@ u4 utf_bytes(utf *u);
  /* get next unicode character of a utf-string */
  u2 utf_nextu2(char **utf);
  
-/* get number of unicode characters of a utf string */
+/* count / decode the UTF-16 u2s of a zero-terminated UTF-8 string (safe for invalid UTF-8) */
+s4 utf8_safe_number_of_u2s(const char *text);
+void utf8_safe_convert_to_u2s(const char *text, u2 *buffer);
+
+/* get (number of) unicode characters of a utf string (UNSAFE!) */
  u4 utf_get_number_of_u2s(utf *u);
  u4 utf_get_number_of_u2s_for_buffer(const char *buffer, u4 blength);
author	edwin <none@none>
	Tue, 24 Oct 2006 16:41:54 +0000 (16:41 +0000)
committer	edwin <none@none>
	Tue, 24 Oct 2006 16:41:54 +0000 (16:41 +0000)
src/native/jni.c		patch \| blob \| history
src/vm/string.c		patch \| blob \| history
src/vm/stringlocal.h		patch \| blob \| history
src/vm/utf8.c		patch \| blob \| history
src/vm/utf8.h		patch \| blob \| history