eglib/src/gunicode.c

   1 /*
   2  * gunicode.c: Some Unicode routines
   3  *
   4  * Author:
   5  *   Miguel de Icaza (miguel@novell.com)
   6  *
   7  * (C) 2006 Novell, Inc.
   8  *
   9  * utf8 validation code came from:
  10  *      libxml2-2.6.26 licensed under the MIT X11 license
  11  *
  12  * Authors credit in libxml's string.c:
  13  *   William Brack <wbrack@mmm.com.hk>
  14  *   daniel@veillard.com
  15  *
  16  * Permission is hereby granted, free of charge, to any person obtaining
  17  * a copy of this software and associated documentation files (the
  18  * "Software"), to deal in the Software without restriction, including
  19  * without limitation the rights to use, copy, modify, merge, publish,
  20  * distribute, sublicense, and/or sell copies of the Software, and to
  21  * permit persons to whom the Software is furnished to do so, subject to
  22  * the following conditions:
  23  *
  24  * The above copyright notice and this permission notice shall be
  25  * included in all copies or substantial portions of the Software.
  26  *
  27  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  28  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  29  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  30  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  31  * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  32  * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  33  * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  34  *
  35  */
  36 #include <stdio.h>
  37 #include <glib.h>
  38 #include <errno.h>
  39 #ifdef _MSC_VER
  40 /* FIXME */
  41 #define CODESET 1
  42 typedef int iconv_t;
  43 #else
  44 #include <langinfo.h>
  45 #include <iconv.h>
  46 #endif
  47
  48 static char *my_charset;
  49 static gboolean is_utf8;
  50
  51 GUnicodeType
  52 g_unichar_type (gunichar c)
  53 {
  54         g_error ("%s", "g_unichar_type is not implemented");
  55         return 0;
  56 }
  57
  58 gunichar
  59 g_unichar_tolower (gunichar c)
  60 {
  61         g_error ("%s", "g_unichar_type is not implemented");
  62         return 0;
  63 }
  64
  65 gboolean
  66 g_unichar_isxdigit (gunichar c)
  67 {
  68         return (g_unichar_xdigit_value (c) != -1);
  69
  70 }
  71
  72 gint
  73 g_unichar_xdigit_value (gunichar c)
  74 {
  75         if (c >= 0x30 && c <= 0x39) /*0-9*/
  76                 return (c - 0x30);
  77         if (c >= 0x41 && c <= 0x46) /*A-F*/
  78                 return (c - 0x37);
  79         if (c >= 0x61 && c <= 0x66) /*a-f*/
  80                 return (c - 0x57);
  81         return -1;
  82 }
  83
  84 gchar *
  85 g_convert (const gchar *str, gssize len,
  86            const gchar *to_codeset, const gchar *from_codeset,
  87            gsize *bytes_read, gsize *bytes_written, GError **error)
  88 {
  89         char *result = NULL;
  90 #ifdef G_OS_WIN32
  91 #else
  92         iconv_t convertor;
  93         char *buffer, *output;
  94         const char *strptr = (const char *) str;
  95         size_t str_len = len == -1 ? strlen (str) : len;
  96         size_t buffer_size;
  97         size_t left, out_left;
  98
  99         convertor = iconv_open (to_codeset, from_codeset);
 100         if (convertor == (iconv_t) -1){
 101                 *bytes_written = 0;
 102                 *bytes_read = 0;
 103                 return NULL;
 104         }
 105
 106         buffer_size = str_len + 1 + 8;
 107         buffer = g_malloc (buffer_size);
 108         out_left = str_len;
 109         output = buffer;
 110         left = str_len;
 111         while (left > 0){
 112                 int res = iconv (convertor, (char **) &strptr, &left, &output, &out_left);
 113                 if (res == (size_t) -1){
 114                         if (errno == E2BIG){
 115                                 char *n;
 116                                 size_t extra_space = 8 + left;
 117                                 size_t output_used = output - buffer;
 118
 119                                 buffer_size += extra_space;
 120
 121                                 n = g_realloc (buffer, buffer_size);
 122
 123                                 if (n == NULL){
 124                                         if (error != NULL)
 125                                                 *error = g_error_new (NULL, G_CONVERT_ERROR_FAILED, "No memory left");
 126                                         g_free (buffer);
 127                                         result = NULL;
 128                                         goto leave;
 129                                 }
 130                                 buffer = n;
 131                                 out_left += extra_space;
 132                                 output = buffer + output_used;
 133                         } else if (errno == EILSEQ){
 134                                 if (error != NULL)
 135                                         *error = g_error_new (NULL, G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "Invalid multi-byte sequence on input");
 136                                 result = NULL;
 137                                 g_free (buffer);
 138                                 goto leave;
 139                         } else if (errno == EINVAL){
 140                                 if (error != NULL)
 141                                         *error = g_error_new (NULL, G_CONVERT_ERROR_PARTIAL_INPUT, "Partial character sequence");
 142                                 result = NULL;
 143                                 g_free (buffer);
 144                                 goto leave;
 145                         }
 146                 }
 147         }
 148         if (bytes_read != NULL)
 149                 *bytes_read = strptr - str;
 150         if (bytes_written != NULL)
 151                 *bytes_written = output - buffer;
 152         *output = 0;
 153         result = buffer;
 154  leave:
 155         iconv_close (convertor);
 156 #endif
 157         return result;
 158 }
 159
 160 /*
 161  * This is broken, and assumes an UTF8 system, but will do for eglib's first user
 162  */
 163 gchar *
 164 g_filename_from_utf8 (const gchar *utf8string, gssize len, gsize *bytes_read, gsize *bytes_written, GError **error)
 165 {
 166         char *res;
 167
 168         if (len == -1)
 169                 len = strlen (utf8string);
 170
 171         res = g_malloc (len + 1);
 172         g_strlcpy (res, utf8string, len + 1);
 173         return res;
 174 }
 175
 176 gboolean
 177 g_get_charset (G_CONST_RETURN char **charset)
 178 {
 179 #ifdef G_OS_WIN32
 180 #else
 181         if (my_charset == NULL){
 182                 my_charset = g_strdup (nl_langinfo (CODESET));
 183                 is_utf8 = strcmp (my_charset, "UTF-8") == 0;
 184         }
 185
 186         if (charset != NULL)
 187                 *charset = my_charset;
 188
 189 #endif
 190         return is_utf8;
 191 }
 192
 193 gchar *
 194 g_locale_to_utf8 (const gchar *opsysstring, gssize len, gsize *bytes_read, gsize *bytes_written, GError **error)
 195 {
 196         g_get_charset (NULL);
 197
 198         return g_convert (opsysstring, len, "UTF-8", my_charset, bytes_read, bytes_written, error);
 199 }
 200
 201 gchar *
 202 g_locale_from_utf8 (const gchar *utf8string, gssize len, gsize *bytes_read, gsize *bytes_written, GError **error)
 203 {
 204         g_get_charset (NULL);
 205
 206         return g_convert (utf8string, len, my_charset, "UTF-8", bytes_read, bytes_written, error);
 207 }
 208 /**
 209  * g_utf8_validate
 210  * @utf: Pointer to putative UTF-8 encoded string.
 211  *
 212  * Checks @utf for being valid UTF-8. @utf is assumed to be
 213  * null-terminated. This function is not super-strict, as it will
 214  * allow longer UTF-8 sequences than necessary. Note that Java is
 215  * capable of producing these sequences if provoked. Also note, this
 216  * routine checks for the 4-byte maximum size, but does not check for
 217  * 0x10ffff maximum value.
 218  *
 219  * Return value: true if @utf is valid.
 220  **/
 221 gboolean
 222 g_utf8_validate (const gchar *utf, gssize max_len, const gchar **end)
 223 {
 224         int ix;
 225
 226         g_return_val_if_fail (utf != NULL, FALSE);
 227
 228         if (max_len == -1)
 229                 max_len = strlen (utf);
 230
 231         /*
 232          * utf is a string of 1, 2, 3 or 4 bytes.  The valid strings
 233          * are as follows (in "bit format"):
 234          *    0xxxxxxx                                      valid 1-byte
 235          *    110xxxxx 10xxxxxx                             valid 2-byte
 236          *    1110xxxx 10xxxxxx 10xxxxxx                    valid 3-byte
 237          *    11110xxx 10xxxxxx 10xxxxxx 10xxxxxx           valid 4-byte
 238          */
 239         for (ix = 0; ix < max_len;) {      /* string is 0-terminated */
 240                 unsigned char c;
 241
 242                 c = utf[ix];
 243                 if ((c & 0x80) == 0x00) {       /* 1-byte code, starts with 10 */
 244                         ix++;
 245                 } else if ((c & 0xe0) == 0xc0) {/* 2-byte code, starts with 110 */
 246                         if (((ix+1) >= max_len) || (utf[ix+1] & 0xc0 ) != 0x80){
 247                                 if (end != NULL)
 248                                         *end = &utf [ix];
 249                                 return FALSE;
 250                         }
 251                         ix += 2;
 252                 } else if ((c & 0xf0) == 0xe0) {/* 3-byte code, starts with 1110 */
 253                         if (((ix + 2) >= max_len) ||
 254                             ((utf[ix+1] & 0xc0) != 0x80) ||
 255                             ((utf[ix+2] & 0xc0) != 0x80)){
 256                                 if (end != NULL)
 257                                         *end = &utf [ix];
 258                                 return FALSE;
 259                         }
 260                         ix += 3;
 261                 } else if ((c & 0xf8) == 0xf0) {/* 4-byte code, starts with 11110 */
 262                         if (((ix + 3) >= max_len) ||
 263                             ((utf[ix+1] & 0xc0) != 0x80) ||
 264                             ((utf[ix+2] & 0xc0) != 0x80) ||
 265                             ((utf[ix+3] & 0xc0) != 0x80)){
 266                                 if (end != NULL)
 267                                         *end = &utf [ix];
 268                                 return FALSE;
 269                         }
 270                         ix += 4;
 271                 } else {/* unknown encoding */
 272                         if (end != NULL)
 273                                 *end = &utf [ix];
 274                         return FALSE;
 275                 }
 276         }
 277
 278         return TRUE;
 279 }