2 * ucm2cp.c - Convert IBM ".ucm" files or hexadecimal mapping ".TXT" files
3 * into code page handling classes.
5 * Copyright (c) 2002 Southern Storm Software, Pty Ltd
6 * Copyright (c) 2006 Bruno Haible
8 * Permission is hereby granted, free of charge, to any person obtaining
9 * a copy of this software and associated documentation files (the "Software"),
10 * to deal in the Software without restriction, including without limitation
11 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12 * and/or sell copies of the Software, and to permit persons to whom the
13 * Software is furnished to do so, subject to the following conditions:
15 * The above copyright notice and this permission notice shall be included
16 * in all copies or substantial portions of the Software.
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
22 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
23 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
24 * OTHER DEALINGS IN THE SOFTWARE.
29 Usage: ucm2cp [options] file
31 --region name I18N region name
32 --page num Code page number
33 --wpage num Windows code page number (optional)
34 --name str Human-readable encoding name
35 --webname str Web name of the encoding
36 --headername str Header name of the encoding (optional)
37 --bodyname str Body name of the encoding (optional)
38 --no-browser-display Set browser display value to false (optional)
39 --no-browser-save Set browser save value to false (optional)
40 --no-mailnews-display Set mail/news display value to false (optional)
41 --no-mailnews-save Set mail/news save value to false (optional)
/* Settings collected from the command line by main() and read by the
   print* functions further down. */
/* Printed as the "I18N.<region>" namespace name in the generated file. */
52 static char *region = 0;
/* Set from --page through atoi(); used in the generated CP<n> class name
   and as the first argument of the generated base(...) call. */
53 static int codePage = 0;
/* Set from --wpage through atoi(); main() also assigns codePage to it in
   its "defaults for unspecified options" section. */
54 static int windowsCodePage = 0;
/* Human-readable encoding name; printed in the banner line of the
   generated file and as a string argument of the base(...) call. */
55 static char *name = 0;
/* Web name; printed in the base(...) call, used to build the ENC<...>
   class name, and assigned to headerName in main()'s defaults section. */
56 static char *webName = 0;
/* Set from --headername; printed in the base(...) call. */
57 static char *headerName = 0;
/* Printed as the first of the three name strings in the base(...) call. */
58 static char *bodyName = 0;
/* The four flags below start at 1 and are printed by printHeader() as the
   literals "true" or "false". The --no-* options are the only writers seen
   in main(); the visible assignments set isBrowserDisplay and
   isMailNewsDisplay to 0. */
59 static int isBrowserDisplay = 1;
60 static int isBrowserSave = 1;
61 static int isMailNewsDisplay = 1;
62 static int isMailNewsSave = 1;
/* Input file name; main() tests it for '/' and '\\' path separators and
   printHeader() prints it in the "Generated from" line. */
63 static const char *filename = 0;
66 * Forward declarations.
/* Prototypes for file-local functions whose definitions appear further
   down the file, after main(). */
68 static void usage(char *progname);
69 static void loadCharMaps(FILE *file);
70 static void printHeader(void);
71 static void printFooter(void);
72 static void printByteToChar(void);
73 static void printCharToByte(void);
/* Program entry point: reads the options, checks that the mandatory ones
   were given, opens the input file named by the remaining argument, and
   then performs the load/print steps named in the comments below. */
75 int main(int argc, char *argv[])
/* argv[0] is kept for the usage message. */
77 char *progname = argv[0];
81 /* Process the command-line options */
/* The option under inspection is always argv[1]; options that take a value
   also require argc > 2 so that argv[2] exists.
   NOTE(review): the statements that advance argv/argc are not shown here. */
82 while(argc > 1 && argv[1][0] == '-')
84 if(!strcmp(argv[1], "--page") && argc > 2)
/* atoi() yields 0 for non-numeric text, which the !codePage test further
   down then treats the same as a missing --page. */
86 codePage = atoi(argv[2]);
90 else if(!strcmp(argv[1], "--wpage") && argc > 2)
92 windowsCodePage = atoi(argv[2]);
96 else if(!strcmp(argv[1], "--region") && argc > 2)
102 else if(!strcmp(argv[1], "--name") && argc > 2)
108 else if(!strcmp(argv[1], "--webname") && argc > 2)
114 else if(!strcmp(argv[1], "--headername") && argc > 2)
116 headerName = argv[2];
120 else if(!strcmp(argv[1], "--bodyname") && argc > 2)
/* The --no-* switches take no value, hence no argc > 2 requirement. */
126 else if(!strcmp(argv[1], "--no-browser-display"))
128 isBrowserDisplay = 0;
130 else if(!strcmp(argv[1], "--no-browser-save"))
134 else if(!strcmp(argv[1], "--no-mailnews-display"))
136 isMailNewsDisplay = 0;
138 else if(!strcmp(argv[1], "--no-mailnews-save"))
146 /* Make sure that we have sufficient options */
/* region, a non-zero codePage, name and webName are mandatory, and exactly
   one argument (the input file) must remain after the options. */
147 if(!region || !codePage || !name || !webName || argc != 2)
153 /* Set defaults for unspecified options */
/* NOTE(review): the conditions guarding these two assignments are not
   shown here; presumably each applies only when the option was omitted. */
156 headerName = webName;
164 windowsCodePage = codePage;
167 /* Open the UCM or TXT file */
/* Opened in text mode for reading; argv[1] is the single remaining
   argument that the argc != 2 test above refers to. */
168 file = fopen(argv[1], "r");
175 len = strlen(filename);
/* The loop runs while len > 0 and the character at len - 1 is neither '/'
   nor '\\'. NOTE(review): presumably the body steps len backwards to find
   the start of the base name - confirm against the full source. */
176 while(len > 0 && filename[len - 1] != '/' && filename[len - 1] != '\\')
182 /* Load the character maps from the input file */
185 /* Print the output header */
188 /* Print the byte->char conversion table */
191 /* Output the char->byte conversion methods */
194 /* Print the output footer */
197 /* Clean up and exit */
/* Write the option summary to stderr. progname is substituted into the
   first line; every other line is a fixed string. */
202 static void usage(char *progname)
204 fprintf(stderr, "Usage: %s [options] file\n\n", progname);
205 fprintf(stderr, " --region name I18N region name\n");
206 fprintf(stderr, " --page num Code page number\n");
207 fprintf(stderr, " --wpage num Windows code page number (optional)\n");
208 fprintf(stderr, " --name str Human-readable encoding name\n");
209 fprintf(stderr, " --webname str Web name of the encoding\n");
210 fprintf(stderr, " --headername str Header name of the encoding (optional)\n");
211 fprintf(stderr, " --bodyname str Body name of the encoding (optional)\n");
212 fprintf(stderr, " --no-browser-display Set browser display value to false (optional)\n");
213 fprintf(stderr, " --no-browser-save Set browser save value to false (optional)\n");
214 fprintf(stderr, " --no-mailnews-display Set mail/news display value to false (optional)\n");
215 fprintf(stderr, " --no-mailnews-save Set mail/news save value to false (optional)\n");
219 * Map bytes to characters. The level value is used to determine
220 * which char mapping is the most likely if there is more than one.
/* Indexed by byte value (256 entries). loadCharMaps() presets every entry
   to '?' with level 100 and replaces an entry only when a mapping with a
   strictly lower level is read. */
222 static unsigned byteToChar[256];
223 static int byteToCharLevel[256];
226 * Map characters to bytes.
/* Indexed by character value (65536 entries). loadCharMaps() presets every
   entry to -1, which the print code treats as "no mapping". */
228 static int charToByte[65536];
231 * Parse a hexadecimal value. Returns the length
232 * of the value that was parsed.
/* Fold the hexadecimal digits found in buf into *value.
   buf   - NUL-terminated text positioned at the first candidate digit.
   value - accumulator; each accepted digit replaces it with
           *value * 16 + digit.
   loadCharMaps() adds the return value to its cursor (buf += parseHex(...)).
   NOTE(review): the initialisation of *value, the handling of a non-digit
   character and the return statement are not shown here. */
234 static int parseHex(const char *buf, unsigned long *value)
/* len indexes the character currently being examined. */
239 while((ch = buf[len]) != '\0')
/* Decimal digits contribute 0..9. */
241 if(ch >= '0' && ch <= '9')
243 *value = *value * 16 + (unsigned long)(ch - '0');
/* Upper-case and lower-case letters A..F both contribute 10..15. */
245 else if(ch >= 'A' && ch <= 'F')
247 *value = *value * 16 + (unsigned long)(ch - 'A' + 10);
249 else if(ch >= 'a' && ch <= 'f')
251 *value = *value * 16 + (unsigned long)(ch - 'a' + 10);
263 * Load the character mapping information from a UCM or TXT file.
/* Read mapping lines from file and fill byteToChar, byteToCharLevel and
   charToByte. Two line syntaxes are recognised: lines starting with "<U"
   and lines starting with "0x"; the syntax in effect is decided from the
   first line that begins with "CHARMAP" or "0x".
   NOTE(review): many statements of this function, including any range
   checks on posn and byteValue, are not shown here. */
265 static void loadCharMaps(FILE *file)
267 enum { unknown, ucm, txt } syntax;
269 unsigned long byteValue;
274 /* Initialize the mapping tables */
/* Every byte starts out decoding to '?' at level 100, so a mapping read
   from the file with a lower level replaces it (see the level test below). */
275 for(posn = 0; posn < 256; ++posn)
277 byteToChar[posn] = (unsigned)'?';
278 byteToCharLevel[posn] = 100;
/* -1 marks a character that has no byte assigned. */
280 for(posn = 0; posn < 65536; ++posn)
282 charToByte[posn] = -1;
287 /* Read the contents of the file */
/* One line per iteration, at most BUFSIZ - 1 characters of it. */
288 while(fgets(buffer, BUFSIZ, file))
290 /* Syntax recognition */
/* Detection is attempted only while no syntax has been chosen yet. */
291 if (syntax == unknown)
293 if (memcmp(buffer, "CHARMAP", 7) == 0)
295 else if (memcmp(buffer, "0x", 2) == 0)
301 /* Lines of interest begin with "<U" */
302 if(buffer[0] != '<' || buffer[1] != 'U')
307 /* Parse the fields on the line */
/* First hexadecimal field: the character value, stored in posn. */
309 buf += parseHex(buf, &posn);
/* Advance to the next backslash, which must introduce "\x". */
314 while(*buf != '\0' && *buf != '\\')
318 if(*buf != '\\' || buf[1] != 'x')
/* Second hexadecimal field: the byte value. */
323 buf += parseHex(buf, &byteValue);
/* Advance to the '|' that precedes the level digit. */
328 while(*buf != '\0' && *buf != '|')
/* The level is the single decimal digit at buf[1]. */
336 level = (int)(buf[1] - '0');
344 /* Lines of interest begin with "0x" */
345 if(buffer[0] != '0' || buffer[1] != 'x')
348 /* Parse the fields on the line */
/* %n records in cnt how many characters the first field consumed. */
349 if(sscanf(buffer, "0x%x%n", &x, &cnt) <= 0)
/* Skip the blanks and tabs that separate the two fields. */
354 while (buffer[cnt] == ' ' || buffer[cnt] == '\t')
/* The second field must also be a "0x" hexadecimal number. */
356 if(sscanf(buffer+cnt, "0x%x", &x) != 1)
366 /* Update the byte->char mapping table */
/* A strictly lower level wins; equal levels keep the earlier entry. */
367 if(level < byteToCharLevel[byteValue])
369 byteToCharLevel[byteValue] = level;
370 byteToChar[byteValue] = (unsigned)posn;
373 /* Update the char->byte mapping table */
/* As far as the visible lines show, this assignment is not subject to the
   level test above, so the last line read for a character wins. */
374 charToByte[posn] = (int)byteValue;
/* License text that printHeader() writes to stdout with fputs(). It is
   built from adjacent string literals joined by backslash line
   continuations, so no comment may be placed between its lines. */
378 #define COPYRIGHT_MSG \
380 " * Copyright (c) 2002 Southern Storm Software, Pty Ltd\n" \
382 " * Permission is hereby granted, free of charge, to any person obtaining\n" \
383 " * a copy of this software and associated documentation files (the \"Software\"),\n" \
384 " * to deal in the Software without restriction, including without limitation\n" \
385 " * the rights to use, copy, modify, merge, publish, distribute, sublicense,\n" \
386 " * and/or sell copies of the Software, and to permit persons to whom the\n" \
387 " * Software is furnished to do so, subject to the following conditions:\n" \
389 " * The above copyright notice and this permission notice shall be included\n" \
390 " * in all copies or substantial portions of the Software.\n" \
392 " * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n" \
393 " * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n" \
394 " * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL\n" \
395 " * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR\n" \
396 " * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,\n" \
397 " * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR\n" \
398 " * OTHER DEALINGS IN THE SOFTWARE.\n" \
402 * Print the header for the current code page definition.
/* Write the start of the generated C# file to stdout, in this order: the
   banner line naming CP<codePage>.cs and the encoding name, the license
   text from COPYRIGHT_MSG, a "Generated from" line with the input file
   name, the I18N.<region> namespace opening, three using directives, and
   the [Serializable] class CP<codePage> deriving from ByteEncoding.
   The generated constructor forwards to base(...) with codePage, ToChars
   and name, then bodyName, headerName and webName as strings, then the
   four display/save flags as true/false, then one final integer.
   NOTE(review): the argument supplying that final %d is not shown here. */
404 static void printHeader(void)
406 printf("/*\n * CP%d.cs - %s code page.\n", codePage, name);
407 fputs(COPYRIGHT_MSG, stdout);
408 printf("// Generated from \"%s\".\n\n", filename);
409 printf("namespace I18N.%s\n{\n\n", region);
410 printf("using System;\n");
411 printf("using System.Text;\n");
412 printf("using I18N.Common;\n\n");
413 printf("[Serializable]\n");
414 printf("public class CP%d : ByteEncoding\n{\n", codePage);
415 printf("\tpublic CP%d()\n", codePage);
416 printf("\t\t: base(%d, ToChars, \"%s\",\n", codePage, name);
417 printf("\t\t \"%s\", \"%s\", \"%s\",\n",
418 bodyName, headerName, webName);
419 printf("\t\t %s, %s, %s, %s, %d)\n",
420 (isBrowserDisplay ? "true" : "false"),
421 (isBrowserSave ? "true" : "false"),
422 (isMailNewsDisplay ? "true" : "false"),
423 (isMailNewsSave ? "true" : "false"),
429 * Print an encoding name, adjusted to look like a type name.
/* Write an encoding name to stdout in a form usable inside an identifier.
   The parameter is called name and therefore hides the file-level name
   variable inside this function.
   NOTE(review): the loop over the string and the handling of characters
   other than A-Z and '-' are not shown here. */
431 static void printEncodingName(const char *name)
/* Upper-case ASCII letters are written as their lower-case counterparts. */
435 if(*name >= 'A' && *name <= 'Z')
437 putc(*name - 'A' + 'a', stdout);
/* A hyphen is special-cased; the replacement written for it is on a line
   that is not shown here. */
439 else if(*name == '-')
452 * Print the footer for the current code page definition.
/* Write the end of the generated C# file to stdout: the closing brace of
   class CP<codePage>, a [Serializable] class named ENC followed by the
   output of printEncodingName(webName) that derives from CP<codePage> and
   has only an empty constructor, and the closing brace of the
   I18N.<region> namespace. */
454 static void printFooter(void)
456 printf("}; // class CP%d\n\n", codePage);
457 printf("[Serializable]\n");
458 printf("public class ENC");
459 printEncodingName(webName);
460 printf(" : CP%d\n{\n", codePage);
461 printf("\tpublic ENC");
462 printEncodingName(webName);
463 printf("() : base() {}\n\n");
464 printf("}; // class ENC");
465 printEncodingName(webName);
466 printf("\n\n}; // namespace I18N.%s\n", region);
470 * Print the byte->char conversion table.
/* Write the generated ToChars table to stdout: a private static readonly
   char array with one entry per byte value, taken from byteToChar.
   NOTE(review): the statements that break the output into rows are not
   shown here. */
472 static void printByteToChar(void)
475 printf("\tprivate static readonly char[] ToChars = {");
476 for(posn = 0; posn < 256; ++posn)
/* Each entry is printed as a C# char literal with a four-digit,
   upper-case hexadecimal escape. */
482 printf("'\\u%04X', ", byteToChar[posn]);
484 printf("\n\t};\n\n");
488 * Print a "switch" statement that converts "ch" from
489 * a character value into a byte value.
/* Write to stdout the generated C# "switch" that turns the character value
   held in "ch" into a byte value, driven by the contents of charToByte.
   forString - tested on the visible line that precedes the last
               "ch = 0x3F" of the full-width default branch; the two calls
               in printCharToByte() pass 0 for the char* overload and 1 for
               the String overload.
   NOTE(review): several statements (initialisers, else branches, brace
   lines) are not shown here, so the comments below describe only what the
   visible lines establish. */
491 static void printConvertSwitch(int forString)
493 unsigned long directLimit;
496 unsigned long rangeSize;
500 /* Find the limit of direct byte mappings */
/* The loop continues while the value maps to a byte equal to itself, and
   never goes past 256. */
502 while(directLimit < 256 && charToByte[directLimit] == (int)directLimit)
507 /* Determine if we have the full-width Latin1 mappings, which
508 we can optimise in the default case of the switch */
/* For every k, character 0xFF01 + k is expected to map to byte 0x21 + k;
   the test below detects a character that breaks this pattern. */
510 for(posn = 0xFF01; posn <= 0xFF5E; ++posn)
512 if((charToByte[posn] - 0x21) != (int)(posn - 0xFF01))
518 /* Print the switch header. The "if" is an optimisation
519 to ignore the common case of direct ASCII mappings */
520 printf("\t\t\tif(ch >= %lu) switch(ch)\n", directLimit);
523 /* Handle all direct byte mappings above the direct limit */
/* Identity mappings only get a case label; they share the single "break"
   printed after the loop, which leaves "ch" unchanged. */
525 for(posn = directLimit; posn < 256; ++posn)
527 if(charToByte[posn] == (int)posn)
530 printf("\t\t\t\tcase 0x%04lX:\n", posn);
535 printf("\t\t\t\t\tbreak;\n");
538 /* Handle the indirect mappings */
539 for(posn = 0; posn < 65536; ++posn)
541 if(haveFullWidth && posn >= 0xFF01 && posn <= 0xFF5E)
543 /* Handle full-width Latin1 conversions later */
/* Only characters that are mapped, and mapped to something other than
   themselves, need a case of their own. */
546 if(charToByte[posn] != (int)posn &&
547 charToByte[posn] != -1)
549 /* See if we have a run of 4 or more characters that
550 can be mapped algorithmically to some other range */
552 for(posn2 = posn + 1; posn2 < 65536; ++posn2)
/* An identity mapping or an unmapped character is tested for here. */
554 if(charToByte[posn2] == (int)posn2 ||
555 charToByte[posn2] == -1)
559 if((charToByte[posn2] - charToByte[posn]) !=
568 /* Output a range mapping for the characters */
569 for(posn2 = posn; posn2 < (posn + rangeSize); ++posn2)
571 printf("\t\t\t\tcase 0x%04lX:\n", posn2);
/* Move posn to the last member of the run: the outer loop then resumes
   after the run, and the offset below is computed from that member. */
573 posn += rangeSize - 1;
/* The offset is emitted as a subtraction or as an addition so that the
   printed constant is never negative. */
574 if(((long)posn) >= (long)(charToByte[posn]))
576 printf("\t\t\t\t\tch -= 0x%04lX;\n",
577 (long)(posn - charToByte[posn]));
581 printf("\t\t\t\t\tch += 0x%04lX;\n",
582 (long)(charToByte[posn] - posn));
584 printf("\t\t\t\t\tbreak;\n");
588 /* Use a simple non-algorithmic mapping */
589 printf("\t\t\t\tcase 0x%04lX: ch = 0x%02X; break;\n",
590 posn, (unsigned)(charToByte[posn]));
595 /* Print the switch footer */
/* 0x3F is the ASCII code of '?', the replacement for characters that have
   no mapping. */
599 printf("\t\t\t\tdefault: ch = 0x3F; break;\n");
601 printf("\t\t\t\tdefault:\n");
602 printf("#if NET_2_0\n");
603 printf("\t\t\t\t\tHandleFallback (ref buffer, chars, ref charIndex, ref charCount, bytes, ref byteIndex, ref byteCount);\n");
605 printf("\t\t\t\t\t\tch = 0x3F;\n");
607 printf("\t\t\t\t\tbreak;\n");
/* Default branch used with the full-width mappings: subtracting 0xFEE0
   turns 0xFF01..0xFF5E into 0x21..0x7E. */
612 printf("\t\t\t\tdefault:\n");
613 printf("\t\t\t\t{\n");
614 printf("\t\t\t\t\tif(ch >= 0xFF01 && ch <= 0xFF5E)\n");
615 printf("\t\t\t\t\t\tch -= 0xFEE0;\n");
616 printf("\t\t\t\t\telse\n");
617 if(forString) /* this is basically meaningless, just to make diff for unused code minimum */
618 printf("\t\t\t\t\t\tch = 0x3F;\n");
620 printf("#if NET_2_0\n");
621 printf("\t\t\t\t\t\tHandleFallback (ref buffer, chars, ref charIndex, ref charCount, bytes, ref byteIndex, ref byteCount);\n");
623 printf("\t\t\t\t\t\tch = 0x3F;\n");
626 printf("\t\t\t\t}\n");
627 printf("\t\t\t\tbreak;\n");
633 * Print the char->byte conversion methods.
635 static void printCharToByte(void)
637 /* Print the conversion method for character buffers */
638 printf("\tprotected unsafe override void ToBytes(char* chars, int charCount,\n");
639 printf("\t byte* bytes, int byteCount)\n");
641 printf("\t\tint ch;\n");
642 printf("\t\tint charIndex = 0;\n");
643 printf("\t\tint byteIndex = 0;\n");
644 printf("#if NET_2_0\n");
645 printf("\t\tEncoderFallbackBuffer buffer = null;\n");
647 printf("\t\twhile(charCount > 0)\n");
649 printf("\t\t\tch = (int)(chars[charIndex++]);\n");
650 printConvertSwitch(0);
651 printf("\t\t\tbytes[byteIndex++] = (byte)ch;\n");
652 printf("\t\t\t--charCount;\n");
653 printf("\t\t\t--byteCount;\n");
657 /* Print the conversion method for string buffers */
659 printf("\tprotected override void ToBytes(String s, int charIndex, int charCount,\n");
660 printf("\t byte[] bytes, int byteIndex)\n");
662 printf("\t\tint ch;\n");
663 printf("\t\twhile(charCount > 0)\n");
665 printf("\t\t\tch = (int)(s[charIndex++]);\n");
666 printConvertSwitch(1);
667 printf("\t\t\tbytes[byteIndex++] = (byte)ch;\n");
668 printf("\t\t\t--charCount;\n");