Implemented a bunch of unicode functions for eglib

author Jeffrey Stedfast <fejj@gnome.org>

Tue, 12 Apr 2011 20:04:46 +0000 (16:04 -0400)

committer Jeffrey Stedfast <fejj@gnome.org>

Tue, 12 Apr 2011 20:04:46 +0000 (16:04 -0400)
author Jeffrey Stedfast <fejj@gnome.org>
Tue, 12 Apr 2011 20:04:46 +0000 (16:04 -0400)
committer Jeffrey Stedfast <fejj@gnome.org>
Tue, 12 Apr 2011 20:04:46 +0000 (16:04 -0400)
diff --git a/eglib/src/eglib-remap.h b/eglib/src/eglib-remap.h

index b0a36e1cb46f30cf6facb9f34bffdc09a8c4f128..fc21c161b8b2d67359f3ac9c95e71c96a549e82e 100644 (file)
--- a/eglib/src/eglib-remap.h
+++ b/eglib/src/eglib-remap.h
@@ -180,6 +180,7 @@
  #define g_string_append monoeg_g_string_append
  #define g_string_append_c monoeg_g_string_append_c
  #define g_string_append_len monoeg_g_string_append_len
+#define g_string_append_unichar monoeg_g_string_append_unichar
  #define g_string_append_printf monoeg_g_string_append_printf
  #define g_string_free monoeg_g_string_free
  #define g_string_new monoeg_g_string_new
@@ -203,6 +204,7 @@
  #define g_timer_start monoeg_g_timer_start
  #define g_timer_stop monoeg_g_timer_stop
  #define g_trailingBytesForUTF8 monoeg_g_trailingBytesForUTF8
+#define g_ucs4_to_utf8 monoeg_g_ucs4_to_utf8
  #define g_ucs4_to_utf16 monoeg_g_ucs4_to_utf16
  #define g_unichar_case monoeg_g_unichar_case
  #define g_unichar_isxdigit monoeg_g_unichar_isxdigit
@@ -221,6 +223,7 @@
  #define g_utf8_strup monoeg_g_utf8_strup
  #define g_utf8_to_utf16 monoeg_g_utf8_to_utf16
  #define g_utf8_validate monoeg_g_utf8_validate
+#define g_unichar_to_utf8 monoeg_g_unichar_to_utf8
  #define g_win32_getlocale monoeg_g_win32_getlocale
  #define g_assertion_message monoeg_assertion_message
  #define g_malloc monoeg_malloc
diff --git a/eglib/src/glib.h b/eglib/src/glib.h

index 945e20e338ff57a132ff8de24d5aef04394f40fd..d2060c176f92751d1883d219bcc0b7a31ce9f7e6 100644 (file)
--- a/eglib/src/glib.h
+++ b/eglib/src/glib.h
@@ -72,18 +72,17 @@ typedef unsigned char  guchar;
  #if !G_TYPES_DEFINED
  /* VS 2010 and later have stdint.h */
  #if defined(_MSC_VER) && _MSC_VER < 1600
-typedef __int8                         gint8;
+typedef __int8                 gint8;
  typedef unsigned __int8                guint8;
-typedef __int16                                gint16;
+typedef __int16                        gint16;
  typedef unsigned __int16       guint16;
-typedef __int32                                gint32;
+typedef __int32                        gint32;
  typedef unsigned __int32       guint32;
-typedef __int64                                gint64;
+typedef __int64                        gint64;
  typedef unsigned __int64       guint64;
-typedef float                          gfloat;
-typedef double                         gdouble;
-typedef unsigned __int16       gunichar2;
-typedef int                 gboolean;
+typedef float                  gfloat;
+typedef double                 gdouble;
+typedef int                    gboolean;
  #else
  /* Types defined in terms of the stdint.h */
  typedef int8_t         gint8;
@@ -96,11 +95,13 @@ typedef int64_t        gint64;
  typedef uint64_t       guint64;
  typedef float          gfloat;
  typedef double         gdouble;
-typedef uint16_t       gunichar2;
  typedef int32_t        gboolean;
  #endif
  #endif
  
+typedef guint16 gunichar2;
+typedef guint32 gunichar;
+
  /*
   * Macros
   */
@@ -343,6 +344,7 @@ GString     *g_string_append        (GString *string, const gchar *val);
  void         g_string_printf        (GString *string, const gchar *format, ...);
  void         g_string_append_printf (GString *string, const gchar *format, ...);
  void         g_string_append_vprintf (GString *string, const gchar *format, va_list args);
+GString     *g_string_append_unichar (GString *string, gunichar c);
  GString     *g_string_append_c      (GString *string, gchar c);
  GString     *g_string_append        (GString *string, const gchar *val);
  GString     *g_string_append_len    (GString *string, const gchar *val, gssize len);
@@ -603,7 +605,6 @@ gpointer g_convert_error_quark(void);
   * only used if the old collation code is activated, so this is only the
   * bare minimum to build.
   */
-typedef guint32 gunichar;
  
  typedef enum {
         G_UNICODE_CONTROL,
@@ -683,12 +684,14 @@ typedef enum {
         G_CONVERT_ERROR_NOT_ABSOLUTE_PATH
  } GConvertError;
  
-gchar* g_utf8_strup (const gchar *str, gssize len);
-gchar* g_utf8_strdown (const gchar *str, gssize len);
+gchar     *g_utf8_strup (const gchar *str, gssize len);
+gchar     *g_utf8_strdown (const gchar *str, gssize len);
+gint       g_unichar_to_utf8 (gunichar c, gchar *outbuf);
  gunichar2 *g_utf8_to_utf16 (const gchar *str, glong len, glong *items_read, glong *items_written, GError **error);
  gchar     *g_utf16_to_utf8 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error);
-gunichar2 *g_ucs4_to_utf16 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **error);
  gunichar  *g_utf16_to_ucs4 (const gunichar2 *str, glong len, glong *items_read, glong *items_written, GError **error);
+gchar     *g_ucs4_to_utf8  (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **error);
+gunichar2 *g_ucs4_to_utf16 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **error);
  
  #define u8to16(str) g_utf8_to_utf16(str, (glong)strlen(str), NULL, NULL, NULL)
  
diff --git a/eglib/src/gstring.c b/eglib/src/gstring.c

index b295756b1f8f9bc46fe68fbb31b712f5097e05a7..7aecb6234f85ee4c210d150b3d8b158aea484ebb 100644 (file)
--- a/eglib/src/gstring.c
+++ b/eglib/src/gstring.c
@@ -132,6 +132,20 @@ g_string_append_c (GString *string, gchar c)
         return string;
  }
  
+/* Appends c, encoded by g_unichar_to_utf8, to string; a NULL string fails the guard. */
+GString *
+g_string_append_unichar (GString *string, gunichar c)
+{
+       gchar encoded[6]; /* g_unichar_to_utf8 reports lengths of at most 6 bytes */
+       gint nbytes;
+       
+       g_return_val_if_fail (string != NULL, NULL);
+       
+       /* a non-positive length means nothing was encoded: hand string back untouched */
+       nbytes = g_unichar_to_utf8 (c, encoded);
+       return nbytes > 0 ? g_string_append_len (string, encoded, nbytes) : string;
+}
+
  GString *
  g_string_prepend (GString *string, const gchar *val)
  {
diff --git a/eglib/src/gutf8.c b/eglib/src/gutf8.c

index fed6dd753ca6921ff34a30fa99b43c67906853e1..c9cd375bfe21b414c116919a26c41d756067318b 100644 (file)
--- a/eglib/src/gutf8.c
+++ b/eglib/src/gutf8.c
@@ -450,6 +450,63 @@ utf16_to_utf8_len (const gunichar2 *str, glong len, glong *items_read, GError **
         return ret;
  }
  
+/*
+ * Converts a UCS-4 string into a NUL-terminated buffer obtained from g_malloc,
+ * encoding each character with g_unichar_to_utf8.
+ *
+ * str:           the UCS-4 input
+ * len:           number of characters to convert, or -1 to stop at the first
+ *                zero character
+ * items_read:    if non-NULL, receives the number of characters consumed; on
+ *                failure, the index of the character that was rejected
+ * items_written: if non-NULL, receives the number of bytes produced, not
+ *                counting the trailing NUL; left alone on failure
+ * error:         passed to g_set_error with G_CONVERT_ERROR_ILLEGAL_SEQUENCE
+ *                when g_unichar_to_utf8 rejects a character
+ *
+ * Returns the converted string, or NULL on failure.
+ */
+gchar *
+g_ucs4_to_utf8 (const gunichar *str, glong len, glong *items_read, glong *items_written, GError **error)
+{
+       gchar *result, *cursor;
+       glong total = 0;
+       glong pos;
+       gint nbytes;
+       
+       /* first pass: validate every character and add up the encoded size */
+       for (pos = 0; (len == -1) ? (str[pos] != 0) : (pos < len); pos++) {
+               nbytes = g_unichar_to_utf8 (str[pos], NULL);
+               if (nbytes >= 0) {
+                       total += nbytes;
+                       continue;
+               }
+               
+               /* nothing has been allocated yet, so there is nothing to release */
+               g_set_error (error, G_CONVERT_ERROR, G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
+                            "Invalid sequence in conversion input");
+               
+               if (items_read)
+                       *items_read = pos;
+               
+               return NULL;
+       }
+       
+       /* second pass: encode into a buffer with room for the trailing NUL */
+       result = cursor = g_malloc (total + 1);
+       for (pos = 0; (len == -1) ? (str[pos] != 0) : (pos < len); pos++)
+               cursor += g_unichar_to_utf8 (str[pos], cursor);
+       *cursor = '\0';
+       
+       if (items_written)
+               *items_written = total;
+       
+       if (items_read)
+               *items_read = pos;
+       
+       return result;
+}
+
  static glong
  g_ucs4_to_utf16_len (const gunichar *str, glong len, glong *items_read, GError **error)
  {
@@ -617,3 +674,82 @@ g_utf16_to_ucs4 (const gunichar2 *str, glong len, glong *items_read, glong *item
  
         return retstr;
  }
+
+/**
+ * g_unichar_to_utf8: encodes @c as UTF-8 into @outbuf (1 to 6 bytes, no NUL
+ * terminator); a NULL @outbuf only computes the length. Returns the number of
+ * bytes, or -1 if @c is larger than 2147483647 (7FFFFFFF hex).
+ *
+ * From http://home.tiscali.nl/t876506/utf8tbl.html (ud is the Unicode number):
+ *
+ * If ud <128 (7F hex) then UTF-8 is 1 byte long, the value of ud.
+ *
+ * If ud >=128 and <=2047 (7FF hex) then UTF-8 is 2 bytes long.
+ *    byte 1 = 192 + (ud div 64)
+ *    byte 2 = 128 + (ud mod 64)
+ *
+ * If ud >=2048 and <=65535 (FFFF hex) then UTF-8 is 3 bytes long.
+ *    byte 1 = 224 + (ud div 4096)
+ *    byte 2 = 128 + ((ud div 64) mod 64)
+ *    byte 3 = 128 + (ud mod 64)
+ *
+ * If ud >=65536 and <=2097151 (1FFFFF hex) then UTF-8 is 4 bytes long.
+ *    byte 1 = 240 + (ud div 262144)
+ *    byte 2 = 128 + ((ud div 4096) mod 64)
+ *    byte 3 = 128 + ((ud div 64) mod 64)
+ *    byte 4 = 128 + (ud mod 64)
+ *
+ * If ud >=2097152 and <=67108863 (3FFFFFF hex) then UTF-8 is 5 bytes long.
+ *    byte 1 = 248 + (ud div 16777216)
+ *    byte 2 = 128 + ((ud div 262144) mod 64)
+ *    byte 3 = 128 + ((ud div 4096) mod 64)
+ *    byte 4 = 128 + ((ud div 64) mod 64)
+ *    byte 5 = 128 + (ud mod 64)
+ *
+ * If ud >=67108864 and <=2147483647 (7FFFFFFF hex) then UTF-8 is 6 bytes long.
+ *    byte 1 = 252 + (ud div 1073741824)
+ *    byte 2 = 128 + ((ud div 16777216) mod 64)
+ *    byte 3 = 128 + ((ud div 262144) mod 64)
+ *    byte 4 = 128 + ((ud div 4096) mod 64)
+ *    byte 5 = 128 + ((ud div 64) mod 64)
+ *    byte 6 = 128 + (ud mod 64)
+ **/
+gint
+g_unichar_to_utf8 (gunichar c, gchar *outbuf)
+{
+       guchar base; /* lead byte marker bits; unsigned because they exceed 127 */
+       gint len, i;
+       
+       if (c < 128UL) {
+               base = 0;
+               len = 1;
+       } else if (c < 2048UL) {
+               base = 192;
+               len = 2;
+       } else if (c < 65536UL) {
+               base = 224;
+               len = 3;
+       } else if (c < 2097152UL) {
+               base = 240;
+               len = 4;
+       } else if (c < 67108864UL) {
+               base = 248;
+               len = 5;
+       } else if (c < 2147483648UL) {
+               base = 252;
+               len = 6;
+       } else
+               return -1;
+       
+       if (outbuf != NULL) {
+               for (i = len - 1; i > 0; i--) {
+                       /* mask off 6 bits worth and add 128 */
+                       outbuf[i] = (gchar) (128 + (c & 0x3f));
+                       c >>= 6;
+               }
+               /* the rest of c sits below the marker bits; a 6-bit mask would corrupt 1-byte output */
+               outbuf[0] = (gchar) (base | c);
+       }
+       
+       return len;
+}
author	Jeffrey Stedfast <fejj@gnome.org>
	Tue, 12 Apr 2011 20:04:46 +0000 (16:04 -0400)
committer	Jeffrey Stedfast <fejj@gnome.org>
	Tue, 12 Apr 2011 20:04:46 +0000 (16:04 -0400)
eglib/src/eglib-remap.h		patch \| blob \| history
eglib/src/glib.h		patch \| blob \| history
eglib/src/gstring.c		patch \| blob \| history
eglib/src/gutf8.c		patch \| blob \| history