/*
 * ucm2cp.c - Convert IBM ".ucm" files into code page handling classes.
 *
 * Copyright (c) 2002 Southern Storm Software, Pty Ltd
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
/*

Usage: ucm2cp [options] file

	--region name            I18N region name
	--page num               Code page number
	--wpage num              Windows code page number (optional)
	--name str               Human-readable encoding name
	--webname str            Web name of the encoding
	--headername str         Header name of the encoding (optional)
	--bodyname str           Body name of the encoding (optional)
	--no-browser-display     Set browser display value to false (optional)
	--no-browser-save        Set browser save value to false (optional)
	--no-mailnews-display    Set mail/news display value to false (optional)
	--no-mailnews-save       Set mail/news save value to false (optional)

*/
/*
 * Option values.  These are filled in by main() from the command line
 * and read by the print routines when the output is generated.
 */
static char *region = 0;			/* --region (required) */
static int codePage = 0;			/* --page (required, non-zero) */
static int windowsCodePage = 0;		/* --wpage; 0 means "use codePage" */
static char *name = 0;				/* --name (required) */
static char *webName = 0;			/* --webname (required) */
static char *headerName = 0;		/* --headername; defaults to webName */
static char *bodyName = 0;			/* --bodyname (optional) */
static int isBrowserDisplay = 1;	/* cleared by --no-browser-display */
static int isBrowserSave = 1;		/* cleared by --no-browser-save */
static int isMailNewsDisplay = 1;	/* cleared by --no-mailnews-display */
static int isMailNewsSave = 1;		/* cleared by --no-mailnews-save */
static const char *filename = 0;	/* input file name without its directory */
/*
 * Forward declarations for the routines that main() calls before
 * their definitions appear.
 */
static void usage(char *progname);
static void loadCharMaps(FILE *file);
static void printHeader(void);
static void printFooter(void);
static void printByteToChar(void);
static void printCharToByte(void);
73 int main(int argc, char *argv[])
75 char *progname = argv[0];
79 /* Process the command-line options */
80 while(argc > 1 && argv[1][0] == '-')
82 if(!strcmp(argv[1], "--page") && argc > 2)
84 codePage = atoi(argv[2]);
88 else if(!strcmp(argv[1], "--wpage") && argc > 2)
90 windowsCodePage = atoi(argv[2]);
94 else if(!strcmp(argv[1], "--region") && argc > 2)
100 else if(!strcmp(argv[1], "--name") && argc > 2)
106 else if(!strcmp(argv[1], "--webname") && argc > 2)
112 else if(!strcmp(argv[1], "--headername") && argc > 2)
114 headerName = argv[2];
118 else if(!strcmp(argv[1], "--bodyname") && argc > 2)
124 else if(!strcmp(argv[1], "--no-browser-display"))
126 isBrowserDisplay = 0;
128 else if(!strcmp(argv[1], "--no-browser-save"))
132 else if(!strcmp(argv[1], "--no-mailnews-display"))
134 isMailNewsDisplay = 0;
136 else if(!strcmp(argv[1], "--no-mailnews-save"))
144 /* Make sure that we have sufficient options */
145 if(!region || !codePage || !name || !webName || argc != 2)
151 /* Set defaults for unspecified options */
154 headerName = webName;
162 windowsCodePage = codePage;
165 /* Open the UCM file */
166 file = fopen(argv[1], "r");
173 len = strlen(filename);
174 while(len > 0 && filename[len - 1] != '/' && filename[len - 1] != '\\')
180 /* Load the character maps from the input file */
183 /* Print the output header */
186 /* Print the byte->char conversion table */
189 /* Output the char->byte conversion methods */
192 /* Print the output footer */
195 /* Clean up and exit */
/*
 * Print the command-line usage summary to standard error.
 * "progname" is the program name to show on the first line.
 */
static void usage(char *progname)
{
	fprintf(stderr, "Usage: %s [options] file\n\n", progname);
	fprintf(stderr, " --region name I18N region name\n");
	fprintf(stderr, " --page num Code page number\n");
	fprintf(stderr, " --wpage num Windows code page number (optional)\n");
	fprintf(stderr, " --name str Human-readable encoding name\n");
	fprintf(stderr, " --webname str Web name of the encoding\n");
	fprintf(stderr, " --headername str Header name of the encoding (optional)\n");
	fprintf(stderr, " --bodyname str Body name of the encoding (optional)\n");
	fprintf(stderr, " --no-browser-display Set browser display value to false (optional)\n");
	fprintf(stderr, " --no-browser-save Set browser save value to false (optional)\n");
	fprintf(stderr, " --no-mailnews-display Set mail/news display value to false (optional)\n");
	fprintf(stderr, " --no-mailnews-save Set mail/news save value to false (optional)\n");
}
/*
 * Map bytes to characters.  The level value is used to determine
 * which char mapping is the most likely if there is more than one:
 * a mapping only replaces an earlier one if its level is lower.
 */
static unsigned byteToChar[256];
static int byteToCharLevel[256];

/*
 * Map characters to bytes.  Indexed by 16-bit character value; an
 * entry of -1 means that the character has no byte mapping.
 */
static int charToByte[65536];
/*
 * Parse a hexadecimal value.  Digits are consumed from the start of
 * "buf" until the first character that is not 0-9, A-F or a-f.
 * The parsed number is stored in "*value", which is 0 if there were
 * no digits.  Returns the length of the value that was parsed, in
 * characters.  No overflow check is performed on the accumulated value.
 */
static int parseHex(const char *buf, unsigned long *value)
{
	int len = 0;
	char ch;
	*value = 0;
	while((ch = buf[len]) != '\0')
	{
		if(ch >= '0' && ch <= '9')
		{
			*value = *value * 16 + (unsigned long)(ch - '0');
		}
		else if(ch >= 'A' && ch <= 'F')
		{
			*value = *value * 16 + (unsigned long)(ch - 'A' + 10);
		}
		else if(ch >= 'a' && ch <= 'f')
		{
			*value = *value * 16 + (unsigned long)(ch - 'a' + 10);
		}
		else
		{
			/* First non-hex character ends the value */
			break;
		}
		++len;
	}
	return len;
}
261 * Load the character mapping information from a UCM file.
263 static void loadCharMaps(FILE *file)
266 unsigned long byteValue;
271 /* Initialize the mapping tables */
272 for(posn = 0; posn < 256; ++posn)
274 byteToChar[posn] = (unsigned)'?';
275 byteToCharLevel[posn] = 100;
277 for(posn = 0; posn < 65536; ++posn)
279 charToByte[posn] = -1;
282 /* Read the contents of the file */
283 while(fgets(buffer, BUFSIZ, file))
285 /* Lines of interest begin with "<U" */
286 if(buffer[0] != '<' || buffer[1] != 'U')
291 /* Parse the fields on the line */
293 buf += parseHex(buf, &posn);
298 while(*buf != '\0' && *buf != '\\')
302 if(*buf != '\\' || buf[1] != 'x')
307 buf += parseHex(buf, &byteValue);
312 while(*buf != '\0' && *buf != '|')
320 level = (int)(buf[1] - '0');
322 /* Update the byte->char mapping table */
323 if(level < byteToCharLevel[byteValue])
325 byteToCharLevel[byteValue] = level;
326 byteToChar[byteValue] = (unsigned)posn;
329 /* Update the char->byte mapping table */
330 charToByte[posn] = (int)byteValue;
/*
 * Licence text written into the generated output.  printHeader()
 * emits the opening of the comment and the title line itself, so this
 * text begins part-way through that comment and finishes by closing it.
 */
#define	COPYRIGHT_MSG \
" *\n" \
" * Copyright (c) 2002 Southern Storm Software, Pty Ltd\n" \
" *\n" \
" * Permission is hereby granted, free of charge, to any person obtaining\n" \
" * a copy of this software and associated documentation files (the \"Software\"),\n" \
" * to deal in the Software without restriction, including without limitation\n" \
" * the rights to use, copy, modify, merge, publish, distribute, sublicense,\n" \
" * and/or sell copies of the Software, and to permit persons to whom the\n" \
" * Software is furnished to do so, subject to the following conditions:\n" \
" *\n" \
" * The above copyright notice and this permission notice shall be included\n" \
" * in all copies or substantial portions of the Software.\n" \
" *\n" \
" * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n" \
" * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n" \
" * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL\n" \
" * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR\n" \
" * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,\n" \
" * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR\n" \
" * OTHER DEALINGS IN THE SOFTWARE.\n" \
" */\n\n"
358 * Print the header for the current code page definition.
360 static void printHeader(void)
362 printf("/*\n * CP%d.cs - %s code page.\n", codePage, name);
363 fputs(COPYRIGHT_MSG, stdout);
364 printf("// Generated from \"%s\".\n\n", filename);
365 printf("namespace I18N.%s\n{\n\n", region);
366 printf("using System;\n");
367 printf("using I18N.Common;\n\n");
368 printf("public class CP%d : ByteEncoding\n{\n", codePage);
369 printf("\tpublic CP%d()\n", codePage);
370 printf("\t\t: base(%d, ToChars, \"%s\",\n", codePage, name);
371 printf("\t\t \"%s\", \"%s\", \"%s\",\n",
372 bodyName, headerName, webName);
373 printf("\t\t %s, %s, %s, %s, %d)\n",
374 (isBrowserDisplay ? "true" : "false"),
375 (isBrowserSave ? "true" : "false"),
376 (isMailNewsDisplay ? "true" : "false"),
377 (isMailNewsSave ? "true" : "false"),
/*
 * Print an encoding name, adjusted to look like a type name.
 * Upper case ASCII letters are written in lower case, '-' is
 * written as '_', and every other character is written unchanged.
 * The result goes to standard output with no trailing newline.
 */
static void printEncodingName(const char *name)
{
	while(*name != '\0')
	{
		if(*name >= 'A' && *name <= 'Z')
		{
			putc(*name - 'A' + 'a', stdout);
		}
		else if(*name == '-')
		{
			putc('_', stdout);
		}
		else
		{
			putc(*name, stdout);
		}
		++name;
	}
}
406 * Print the footer for the current code page definition.
408 static void printFooter(void)
410 printf("}; // class CP%d\n\n", codePage);
411 printf("public class ENC");
412 printEncodingName(webName);
413 printf(" : CP%d\n{\n", codePage);
414 printf("\tpublic ENC");
415 printEncodingName(webName);
416 printf("() : base() {}\n\n");
417 printf("}; // class ENC");
418 printEncodingName(webName);
419 printf("\n\n}; // namespace I18N.%s\n", region);
423 * Print the byte->char conversion table.
425 static void printByteToChar(void)
428 printf("\tprivate static readonly char[] ToChars = {");
429 for(posn = 0; posn < 256; ++posn)
435 printf("'\\u%04X', ", byteToChar[posn]);
437 printf("\n\t};\n\n");
441 * Print a "switch" statement that converts "ch" from
442 * a character value into a byte value.
444 static void printConvertSwitch(void)
446 unsigned long directLimit;
449 unsigned long rangeSize;
453 /* Find the limit of direct byte mappings */
455 while(directLimit < 256 && charToByte[directLimit] == (int)directLimit)
460 /* Determine if we have the full-width Latin1 mappings, which
461 we can optimise in the default case of the switch */
463 for(posn = 0xFF01; posn <= 0xFF5E; ++posn)
465 if((charToByte[posn] - 0x21) != (int)(posn - 0xFF01))
471 /* Print the switch header. The "if" is an optimisation
472 to ignore the common case of direct ASCII mappings */
473 printf("\t\t\tif(ch >= %lu) switch(ch)\n", directLimit);
476 /* Handle all direct byte mappings above the direct limit */
478 for(posn = directLimit; posn < 256; ++posn)
480 if(charToByte[posn] == (int)posn)
483 printf("\t\t\t\tcase 0x%04lX:\n", posn);
488 printf("\t\t\t\t\tbreak;\n");
491 /* Handle the indirect mappings */
492 for(posn = 0; posn < 65536; ++posn)
494 if(haveFullWidth && posn >= 0xFF01 && posn <= 0xFF5E)
496 /* Handle full-width Latin1 conversions later */
499 if(charToByte[posn] != (int)posn &&
500 charToByte[posn] != -1)
502 /* See if we have a run of 4 or more characters that
503 can be mapped algorithmically to some other range */
505 for(posn2 = posn + 1; posn2 < 65536; ++posn2)
507 if(charToByte[posn2] == (int)posn2 ||
508 charToByte[posn2] == -1)
512 if((charToByte[posn2] - charToByte[posn]) !=
521 /* Output a range mapping for the characters */
522 for(posn2 = posn; posn2 < (posn + rangeSize); ++posn2)
524 printf("\t\t\t\tcase 0x%04lX:\n", posn2);
526 posn += rangeSize - 1;
527 if(((long)posn) >= (long)(charToByte[posn]))
529 printf("\t\t\t\t\tch -= 0x%04lX;\n",
530 (long)(posn - charToByte[posn]));
534 printf("\t\t\t\t\tch += 0x%04lX;\n",
535 (long)(charToByte[posn] - posn));
537 printf("\t\t\t\t\tbreak;\n");
541 /* Use a simple non-algorithmic mapping */
542 printf("\t\t\t\tcase 0x%04lX: ch = 0x%02X; break;\n",
543 posn, (unsigned)(charToByte[posn]));
548 /* Print the switch footer */
551 printf("\t\t\t\tdefault: ch = 0x3F; break;\n");
555 printf("\t\t\t\tdefault:\n");
556 printf("\t\t\t\t{\n");
557 printf("\t\t\t\t\tif(ch >= 0xFF01 && ch <= 0xFF5E)\n");
558 printf("\t\t\t\t\t\tch -= 0xFEE0;\n");
559 printf("\t\t\t\t\telse\n");
560 printf("\t\t\t\t\t\tch = 0x3F;\n");
561 printf("\t\t\t\t}\n");
562 printf("\t\t\t\tbreak;\n");
/*
 * Print one "ToBytes" override.  "paramDecl" is the C# declaration
 * of the source parameter and "source" is the name that is indexed
 * to fetch each character.  The method body loops over "charCount"
 * characters, converts each with the generated switch, and stores
 * the result in "bytes".
 */
static void printToBytesMethod(const char *paramDecl, const char *source)
{
	printf("\tprotected override void ToBytes(%s, int charIndex, int charCount,\n",
		   paramDecl);
	printf("\t byte[] bytes, int byteIndex)\n");
	printf("\t{\n");
	printf("\t\tint ch;\n");
	printf("\t\twhile(charCount > 0)\n");
	printf("\t\t{\n");
	printf("\t\t\tch = (int)(%s[charIndex++]);\n", source);
	printConvertSwitch();
	printf("\t\t\tbytes[byteIndex++] = (byte)ch;\n");
	printf("\t\t\t--charCount;\n");
	printf("\t\t}\n");
	printf("\t}\n\n");
}

/*
 * Print the char->byte conversion methods.  Two overrides are
 * written, which differ only in where the characters come from.
 */
static void printCharToByte(void)
{
	/* Print the conversion method for character buffers */
	printToBytesMethod("char[] chars", "chars");

	/* Print the conversion method for string buffers */
	printToBytesMethod("String s", "s");
}