2 * ucm2cp.c - Convert IBM ".ucm" files or hexadecimal mapping ".TXT" files
3 * into code page handling classes.
5 * Copyright (c) 2002 Southern Storm Software, Pty Ltd
6 * Copyright (c) 2006 Bruno Haible
7 * Copyright (c) 2013 Mikko Korkalo
9 * Permission is hereby granted, free of charge, to any person obtaining
10 * a copy of this software and associated documentation files (the "Software"),
11 * to deal in the Software without restriction, including without limitation
12 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
13 * and/or sell copies of the Software, and to permit persons to whom the
14 * Software is furnished to do so, subject to the following conditions:
16 * The above copyright notice and this permission notice shall be included
17 * in all copies or substantial portions of the Software.
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
20 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
22 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
23 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
24 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
25 * OTHER DEALINGS IN THE SOFTWARE.
30 Usage: ucm2cp [options] file
32 --region name I18N region name
33 --page num Code page number
34 --wpage num Windows code page number (optional)
35 --name str Human-readable encoding name
36 --webname str Web name of the encoding
37 --headername str Header name of the encoding (optional)
38 --bodyname str Body name of the encoding (optional)
39 --no-browser-display Set browser display value to false (optional)
40 --no-browser-save Set browser save value to false (optional)
41 --no-mailnews-display Set mail/news display value to false (optional)
42 --no-mailnews-save Set mail/news save value to false (optional)
/*
 * Option values and their initial settings. After option parsing,
 * main() tests that region, codePage, name and webName are all non-zero.
 */
53 static char *region = 0; /* I18N region name; printed as the namespace suffix */
54 static int codePage = 0; /* code page number; names the generated CP<n> class */
55 static int windowsCodePage = 0; /* main() assigns codePage to this as a default */
56 static char *name = 0; /* human-readable encoding name */
57 static char *webName = 0; /* web name; also names the generated ENC class */
58 static char *headerName = 0; /* main() assigns webName to this as a default */
59 static char *bodyName = 0; /* body name passed to the generated base(...) call */
60 static int isBrowserDisplay = 1; /* cleared by --no-browser-display */
61 static int isBrowserSave = 1; /* printed as true/false by printHeader() */
62 static int isMailNewsDisplay = 1; /* cleared by --no-mailnews-display */
63 static int isMailNewsSave = 1; /* printed as true/false by printHeader() */
64 static const char *filename = 0; /* input file name quoted in the generated header */
/*
 * Prototypes for static functions that are defined further down the file.
 * All of them write the generated source to stdout except usage(), which
 * writes to stderr, and loadCharMaps(), which fills the mapping tables.
 */
67 * Forward declarations.
69 static void usage(char *progname);
70 static void loadCharMaps(FILE *file);
71 static void printHeader(void);
72 static void printFooter(void);
73 static void printByteToChar(void);
74 static void printCharToByte(void);
/*
 * Program entry point.
 *
 * Walks the leading arguments that start with a dash and matches each one
 * against the known option names, then checks that the mandatory options
 * were supplied, fills in defaults for optional ones and opens the input
 * file named by the one remaining argument. The later stages are named by
 * the stage comments at the end of the function.
 */
76 int main(int argc, char *argv[])
78 char *progname = argv[0];
82 /* Process the command-line options */
83 while(argc > 1 && argv[1][0] == '-')
/* Options that take a value also require argc > 2 so that argv[2] exists */
85 if(!strcmp(argv[1], "--page") && argc > 2)
/* atoi() yields 0 for non-numeric text, which the !codePage test below rejects */
87 codePage = atoi(argv[2]);
91 else if(!strcmp(argv[1], "--wpage") && argc > 2)
93 windowsCodePage = atoi(argv[2]);
97 else if(!strcmp(argv[1], "--region") && argc > 2)
103 else if(!strcmp(argv[1], "--name") && argc > 2)
109 else if(!strcmp(argv[1], "--webname") && argc > 2)
115 else if(!strcmp(argv[1], "--headername") && argc > 2)
117 headerName = argv[2];
121 else if(!strcmp(argv[1], "--bodyname") && argc > 2)
/* The --no-* switches are flags and take no value */
127 else if(!strcmp(argv[1], "--no-browser-display"))
129 isBrowserDisplay = 0;
131 else if(!strcmp(argv[1], "--no-browser-save"))
135 else if(!strcmp(argv[1], "--no-mailnews-display"))
137 isMailNewsDisplay = 0;
139 else if(!strcmp(argv[1], "--no-mailnews-save"))
147 /* Make sure that we have sufficient options */
/* argc != 2 means that exactly one non-option argument (the input file) must remain */
148 if(!region || !codePage || !name || !webName || argc != 2)
154 /* Set defaults for unspecified options */
157 headerName = webName;
165 windowsCodePage = codePage;
168 /* Open the UCM or TXT file */
169 file = fopen(argv[1], "r");
/*
 * Scan backwards from the end of the path while the preceding character
 * is neither a forward slash nor a backslash.
 * NOTE(review): looks intended to locate the base name of the path - confirm.
 */
176 len = strlen(filename);
177 while(len > 0 && filename[len - 1] != '/' && filename[len - 1] != '\\')
183 /* Load the character maps from the input file */
186 /* Print the output header */
189 /* Print the byte->char conversion table */
192 /* Output the char->byte conversion methods */
195 /* Print the output footer */
198 /* Clean up and exit */
/*
 * Print the command-line synopsis followed by one line per option.
 * All output goes to stderr; progname is substituted into the first line.
 */
203 static void usage(char *progname)
205 fprintf(stderr, "Usage: %s [options] file\n\n", progname);
206 fprintf(stderr, " --region name I18N region name\n");
207 fprintf(stderr, " --page num Code page number\n");
208 fprintf(stderr, " --wpage num Windows code page number (optional)\n");
209 fprintf(stderr, " --name str Human-readable encoding name\n");
210 fprintf(stderr, " --webname str Web name of the encoding\n");
211 fprintf(stderr, " --headername str Header name of the encoding (optional)\n");
212 fprintf(stderr, " --bodyname str Body name of the encoding (optional)\n");
213 fprintf(stderr, " --no-browser-display Set browser display value to false (optional)\n");
214 fprintf(stderr, " --no-browser-save Set browser save value to false (optional)\n");
215 fprintf(stderr, " --no-mailnews-display Set mail/news display value to false (optional)\n");
216 fprintf(stderr, " --no-mailnews-save Set mail/news save value to false (optional)\n");
/*
 * byteToChar[b] holds the character currently chosen for byte b and
 * byteToCharLevel[b] holds the level of that choice; loadCharMaps()
 * replaces an entry only when a mapping with a strictly lower level
 * is read.
 */
220 * Map bytes to characters. The level value is used to determine
221 * which char mapping is the most likely if there is more than one.
223 static unsigned byteToChar[256];
224 static int byteToCharLevel[256];
/*
 * charToByte[c] holds the byte for character c, indexed over 0..65535.
 * loadCharMaps() initialises every entry to -1, and printConvertSwitch()
 * treats -1 as having no mapping.
 */
227 * Map characters to bytes.
229 static int charToByte[65536];
/*
 * Parse a run of hexadecimal digits at the start of "buf".
 *
 * The accumulated value is stored in "*value", which is always written
 * and is 0 when "buf" does not begin with a hexadecimal digit.  Parsing
 * stops at the terminating NUL or at the first character that is not
 * one of 0-9, A-F or a-f.
 *
 * Returns the number of characters consumed, so that callers can
 * advance their cursor with "buf += parseHex(buf, &value)".
 */
static int parseHex(const char *buf, unsigned long *value)
{
	int len = 0;
	char ch;

	/* Callers pass uninitialised outputs, so always start from zero */
	*value = 0;
	while((ch = buf[len]) != '\0')
	{
		if(ch >= '0' && ch <= '9')
		{
			*value = *value * 16 + (unsigned long)(ch - '0');
		}
		else if(ch >= 'A' && ch <= 'F')
		{
			*value = *value * 16 + (unsigned long)(ch - 'A' + 10);
		}
		else if(ch >= 'a' && ch <= 'f')
		{
			*value = *value * 16 + (unsigned long)(ch - 'a' + 10);
		}
		else
		{
			/* The first non-hexadecimal character ends the number */
			break;
		}
		++len;
	}
	return len;
}
/*
 * Fill byteToChar, byteToCharLevel and charToByte from the lines of an
 * already opened mapping file.
 *
 * Two line syntaxes are recognised. A line starting with CHARMAP selects
 * the first; its data lines start with <U followed by a hexadecimal
 * character value, then a \x byte value, then a level digit after a
 * vertical bar. A line starting with 0x selects the second; its data
 * lines hold two 0x-prefixed hexadecimal numbers separated by spaces
 * or tabs.
 */
264 * Load the character mapping information from a UCM or TXT file.
266 static void loadCharMaps(FILE *file)
268 enum { unknown, ucm, txt } syntax;
270 unsigned long byteValue;
275 /* Initialize the mapping tables */
/*
 * Every byte starts out mapped to a question mark at level 100, which
 * is higher than any level produced from a single digit below, so a
 * mapping read from the file replaces it.
 */
276 for(posn = 0; posn < 256; ++posn)
278 byteToChar[posn] = (unsigned)'?';
279 byteToCharLevel[posn] = 100;
/* -1 marks a character that has no byte mapping */
281 for(posn = 0; posn < 65536; ++posn)
283 charToByte[posn] = -1;
288 /* Read the contents of the file */
289 while(fgets(buffer, BUFSIZ, file))
291 /* Syntax recognition */
/* The syntax is detected from the line prefix while it is still unknown */
292 if (syntax == unknown)
294 if (memcmp(buffer, "CHARMAP", 7) == 0)
296 else if (memcmp(buffer, "0x", 2) == 0)
302 /* Lines of interest begin with "<U" */
303 if(buffer[0] != '<' || buffer[1] != 'U')
308 /* Parse the fields on the line */
/* parseHex() returns the digit count, so buf ends up just past the number */
310 buf += parseHex(buf, &posn);
/* Skip ahead to the backslash that introduces the byte value */
315 while(*buf != '\0' && *buf != '\\')
319 if(*buf != '\\' || buf[1] != 'x')
324 buf += parseHex(buf, &byteValue);
/* Skip ahead to the vertical bar that introduces the level digit */
329 while(*buf != '\0' && *buf != '|')
/* The level is the single character after the bar, taken as a decimal digit */
337 level = (int)(buf[1] - '0');
345 /* Lines of interest begin with "0x" */
346 if(buffer[0] != '0' || buffer[1] != 'x')
349 /* Parse the fields on the line */
/* %n records how many characters the first number consumed */
350 if(sscanf(buffer, "0x%x%n", &x, &cnt) <= 0)
355 while (buffer[cnt] == ' ' || buffer[cnt] == '\t')
357 if(sscanf(buffer+cnt, "0x%x", &x) != 1)
367 /* Update the byte->char mapping table */
/*
 * A lower level wins, so the first mapping seen at the lowest level is kept.
 * NOTE(review): byteValue indexes 256-entry tables and posn indexes a
 * 65536-entry table - confirm both are range-checked before this point.
 */
368 if(level < byteToCharLevel[byteValue])
370 byteToCharLevel[byteValue] = level;
371 byteToChar[byteValue] = (unsigned)posn;
374 /* Update the char->byte mapping table */
375 charToByte[posn] = (int)byteValue;
/*
 * Licence text for the generated file. printHeader() writes it with
 * fputs() straight after the opening line of the generated banner comment.
 */
379 #define COPYRIGHT_MSG \
381 " * Copyright (c) 2002 Southern Storm Software, Pty Ltd\n" \
383 " * Permission is hereby granted, free of charge, to any person obtaining\n" \
384 " * a copy of this software and associated documentation files (the \"Software\"),\n" \
385 " * to deal in the Software without restriction, including without limitation\n" \
386 " * the rights to use, copy, modify, merge, publish, distribute, sublicense,\n" \
387 " * and/or sell copies of the Software, and to permit persons to whom the\n" \
388 " * Software is furnished to do so, subject to the following conditions:\n" \
390 " * The above copyright notice and this permission notice shall be included\n" \
391 " * in all copies or substantial portions of the Software.\n" \
393 " * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n" \
394 " * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n" \
395 " * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL\n" \
396 " * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR\n" \
397 " * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,\n" \
398 " * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR\n" \
399 " * OTHER DEALINGS IN THE SOFTWARE.\n" \
/*
 * Write the opening of the generated C# source to stdout: the banner
 * comment and licence, the generator notice, the I18N.<region> namespace,
 * the using directives, and the start of class CP<codePage> together with
 * the base(...) call of its constructor.
 */
403 * Print the header for the current code page definition.
405 static void printHeader(void)
407 printf("/*\n * CP%d.cs - %s code page.\n", codePage, name);
408 fputs(COPYRIGHT_MSG, stdout);
409 printf("// Generated from \"%s\".\n\n", filename);
410 printf("// WARNING: Modifying this file directly might be a bad idea.\n");
411 printf("// You should edit the code generator tools/ucm2cp.c instead for your changes\n");
412 printf("// to appear in all relevant classes.\n");
413 printf("namespace I18N.%s\n{\n\n", region);
414 printf("using System;\n");
415 printf("using System.Text;\n");
416 printf("using I18N.Common;\n\n");
417 printf("[Serializable]\n");
418 printf("public class CP%d : ByteEncoding\n{\n", codePage);
419 printf("\tpublic CP%d()\n", codePage);
420 printf("\t\t: base(%d, ToChars, \"%s\",\n", codePage, name);
/* The three names are passed in the order body, header, web */
421 printf("\t\t \"%s\", \"%s\", \"%s\",\n",
422 bodyName, headerName, webName);
/* The four flags are emitted as the literal words true or false */
423 printf("\t\t %s, %s, %s, %s, %d)\n",
424 (isBrowserDisplay ? "true" : "false"),
425 (isBrowserSave ? "true" : "false"),
426 (isMailNewsDisplay ? "true" : "false"),
427 (isMailNewsSave ? "true" : "false"),
/*
 * Write an encoding name to stdout one character at a time.
 * Upper-case ASCII letters are written in lower case, and a dash is
 * handled by its own branch rather than being copied like other characters.
 * printFooter() uses the result as the suffix of the generated ENC class name.
 */
433 * Print an encoding name, adjusted to look like a type name.
435 static void printEncodingName(const char *name)
439 if(*name >= 'A' && *name <= 'Z')
/* Shift the letter from the upper-case range into the lower-case range */
441 putc(*name - 'A' + 'a', stdout);
443 else if(*name == '-')
/*
 * Write the closing part of the generated C# source to stdout: the end of
 * class CP<codePage>, a second serializable class named ENC followed by the
 * adjusted web name that derives from CP<codePage> and has only an empty
 * constructor, and finally the end of the namespace.
 */
456 * Print the footer for the current code page definition.
458 static void printFooter(void)
460 printf("}; // class CP%d\n\n", codePage);
461 printf("[Serializable]\n");
/* The class name is assembled in pieces because printEncodingName() writes directly to stdout */
462 printf("public class ENC");
463 printEncodingName(webName);
464 printf(" : CP%d\n{\n", codePage);
465 printf("\tpublic ENC");
466 printEncodingName(webName);
467 printf("() : base() {}\n\n");
468 printf("}; // class ENC");
469 printEncodingName(webName);
470 printf("\n\n}; // namespace I18N.%s\n", region);
/*
 * Write the generated ToChars array to stdout. It has one entry per byte
 * value 0..255, each printed as a C# character literal holding the
 * four-digit upper-case hexadecimal value stored in byteToChar.
 */
474 * Print the byte->char conversion table.
476 static void printByteToChar(void)
479 printf("\tprivate static readonly char[] ToChars = {");
480 for(posn = 0; posn < 256; ++posn)
486 printf("'\\u%04X', ", byteToChar[posn]);
488 printf("\n\t};\n\n");
/*
 * Write to stdout the generated C# statement that turns the character
 * held in ch into its byte value, driven by the charToByte table.
 *
 * forString selects what the generated default branch does for an
 * unmapped character: a non-zero value emits an assignment of 0x3F to ch,
 * otherwise a HandleFallback call followed by continue is emitted.
 * printCharToByte() passes 0 for the pointer-based method and 1 for the
 * string-based method.
 */
492 * Print a "switch" statement that converts "ch" from
493 * a character value into a byte value.
495 static void printConvertSwitch(int forString)
497 unsigned long directLimit;
500 unsigned long rangeSize;
504 /* Find the limit of direct byte mappings */
/* directLimit grows over the leading characters that map to themselves */
506 while(directLimit < 256 && charToByte[directLimit] == (int)directLimit)
511 /* Determine if we have the full-width Latin1 mappings, which
512 we can optimise in the default case of the switch */
/* Full-width U+FF01..U+FF5E must map to bytes 0x21.. with the same offsets */
514 for(posn = 0xFF01; posn <= 0xFF5E; ++posn)
516 if((charToByte[posn] - 0x21) != (int)(posn - 0xFF01))
522 /* Print the switch header. The "if" is an optimisation
523 to ignore the common case of direct ASCII mappings */
524 printf("\t\t\tif(ch >= %lu) switch(ch)\n", directLimit);
527 /* Handle all direct byte mappings above the direct limit */
/* Identity mappings are emitted as case labels that share one break */
529 for(posn = directLimit; posn < 256; ++posn)
531 if(charToByte[posn] == (int)posn)
534 printf("\t\t\t\tcase 0x%04lX:\n", posn);
539 printf("\t\t\t\t\tbreak;\n");
542 /* Handle the indirect mappings */
543 for(posn = 0; posn < 65536; ++posn)
545 if(haveFullWidth && posn >= 0xFF01 && posn <= 0xFF5E)
547 /* Handle full-width Latin1 conversions later */
/* Only characters that are mapped, and not mapped to themselves, need a case */
550 if(charToByte[posn] != (int)posn &&
551 charToByte[posn] != -1)
553 /* See if we have a run of 4 or more characters that
554 can be mapped algorithmically to some other range */
556 for(posn2 = posn + 1; posn2 < 65536; ++posn2)
558 if(charToByte[posn2] == (int)posn2 ||
559 charToByte[posn2] == -1)
563 if((charToByte[posn2] - charToByte[posn]) !=
572 /* Output a range mapping for the characters */
573 for(posn2 = posn; posn2 < (posn + rangeSize); ++posn2)
575 printf("\t\t\t\tcase 0x%04lX:\n", posn2);
/* Move posn to the last character of the run so the outer loop resumes after it */
577 posn += rangeSize - 1;
/* Emit a subtraction or an addition so that the printed offset is never negative */
578 if(((long)posn) >= (long)(charToByte[posn]))
580 printf("\t\t\t\t\tch -= 0x%04lX;\n",
581 (long)(posn - charToByte[posn]));
585 printf("\t\t\t\t\tch += 0x%04lX;\n",
586 (long)(charToByte[posn] - posn));
588 printf("\t\t\t\t\tbreak;\n");
592 /* Use a simple non-algorithmic mapping */
593 printf("\t\t\t\tcase 0x%04lX: ch = 0x%02X; break;\n",
594 posn, (unsigned)(charToByte[posn]));
599 /* Print the switch footer */
/* NOTE(review): the two default variants below look like alternatives selected by forString - confirm */
603 printf("\t\t\t\tdefault: ch = 0x3F; break;\n");
605 printf("\t\t\t\tdefault:\n");
606 printf("\t\t\t\t\tHandleFallback (ref buffer, chars, ref charIndex, ref charCount, bytes, ref byteIndex, ref byteCount);\n");
607 printf("\t\t\t\t\tcontinue;\n");
/* Default branch variant that folds full-width characters by subtracting 0xFEE0 */
612 printf("\t\t\t\tdefault:\n");
613 printf("\t\t\t\t{\n");
614 printf("\t\t\t\t\tif(ch >= 0xFF01 && ch <= 0xFF5E)\n");
615 printf("\t\t\t\t\t{\n");
616 printf("\t\t\t\t\t\tch -= 0xFEE0;\n");
617 printf("\t\t\t\t\t}\n");
618 printf("\t\t\t\t\telse\n");
619 printf("\t\t\t\t\t{\n");
620 if(forString) /* this is basically meaningless, just to make diff for unused code minimum */
621 printf("\t\t\t\t\t\tch = 0x3F;\n");
623 printf("\t\t\t\t\t\tHandleFallback (ref buffer, chars, ref charIndex, ref charCount, bytes, ref byteIndex, ref byteCount);\n");
624 printf("\t\t\t\t\t\tcontinue;\n");
626 printf("\t\t\t\t\t}\n");
627 printf("\t\t\t\t}\n");
628 printf("\t\t\t\tbreak;\n");
/*
 * Write the generated C# encoding methods to stdout, in this order:
 * GetByteCountImpl for a character pointer, GetByteCount for a String,
 * ToBytes for a character pointer (which forwards to GetBytesImpl),
 * GetBytesImpl itself, and ToBytes for a String. The conversion statement
 * inside the last two is produced by printConvertSwitch().
 */
634 * Print the char->byte conversion methods.
636 static void printCharToByte(void)
638 printf("\t// Get the number of bytes needed to encode a character buffer.\n");
639 printf("\tpublic unsafe override int GetByteCountImpl (char* chars, int count)\n");
/* This printf has no trailing newline in its format, unlike the matching line in GetByteCount below */
641 printf("\t\tif (this.EncoderFallback != null)");
643 printf("\t\t\t//Calculate byte count by actually doing encoding and discarding the data.\n");
644 printf("\t\t\treturn GetBytesImpl(chars, count, null, 0);\n");
646 printf("\t\telse\n");
649 printf("\t\t\treturn count;\n");
653 printf("\t// Get the number of bytes needed to encode a character buffer.\n");
654 printf("\tpublic override int GetByteCount (String s)\n");
656 printf("\t\tif (this.EncoderFallback != null)\n");
658 printf("\t\t\t//Calculate byte count by actually doing encoding and discarding the data.\n");
659 printf("\t\t\tunsafe\n");
661 printf("\t\t\t\tfixed (char *s_ptr = s)\n");
662 printf("\t\t\t\t{\n");
663 printf("\t\t\t\t\treturn GetBytesImpl(s_ptr, s.Length, null, 0);\n");
664 printf("\t\t\t\t}\n");
667 printf("\t\telse\n");
669 printf("\t\t\t//byte count equals character count because no EncoderFallback set\n");
670 printf("\t\t\treturn s.Length;\n");
674 printf("\t//ToBytes is just an alias for GetBytesImpl, but doesn't return byte count\n");
675 printf("\tprotected unsafe override void ToBytes(char* chars, int charCount,\n");
676 printf("\t byte* bytes, int byteCount)\n");
678 printf("\t\t//Calling ToBytes with null destination buffer doesn't make any sense\n");
679 printf("\t\tif (bytes == null)\n");
680 printf("\t\t\tthrow new ArgumentNullException(\"bytes\");\n");
681 printf("\t\tGetBytesImpl(chars, charCount, bytes, byteCount);\n");
685 /* Print the conversion method for character buffers */
686 //printf("\tprotected unsafe override void ToBytes(char* chars, int charCount,\n");
687 //printf("\t byte* bytes, int byteCount)\n");
688 printf("\tpublic unsafe override int GetBytesImpl (char* chars, int charCount,\n");
689 printf("\t byte* bytes, int byteCount)\n");
691 printf("\t\tint ch;\n");
692 printf("\t\tint charIndex = 0;\n");
693 printf("\t\tint byteIndex = 0;\n");
694 printf("\t\tEncoderFallbackBuffer buffer = null;\n");
695 printf("\t\twhile (charCount > 0)\n");
697 printf("\t\t\tch = (int)(chars[charIndex]);\n");
698 printf("\t\t\tcharIndex++;\n");
699 printf("\t\t\tcharCount--;\n");
/* 0 selects the HandleFallback form of the generated default branch */
700 printConvertSwitch(0);
701 printf("\t\t\t//Write encoded byte to buffer, if buffer is defined and fallback was not used\n");
/* The generated code counts bytes even when bytes is null, which is how the byte-count methods reuse it */
702 printf("\t\t\tif (bytes != null)\n");
703 printf("\t\t\t\tbytes[byteIndex] = (byte)ch;\n");
704 printf("\t\t\tbyteIndex++;\n");
705 printf("\t\t\tbyteCount--;\n");
707 printf("\t\treturn byteIndex;\n");
710 /* Print the conversion method for string buffers */
712 printf("\tprotected override void ToBytes(String s, int charIndex, int charCount,\n");
713 printf("\t byte[] bytes, int byteIndex)\n");
715 printf("\t\tint ch;\n");
716 printf("\t\twhile(charCount > 0)\n");
718 printf("\t\t\tch = (int)(s[charIndex++]);\n");
/* 1 selects the form whose generated default branch assigns 0x3F to ch */
719 printConvertSwitch(1);
720 printf("\t\t\tbytes[byteIndex++] = (byte)ch;\n");
721 printf("\t\t\t--charCount;\n");