This commit was manufactured by cvs2svn to create branch 'mono-1-0'.
[mono.git] / mcs / class / I18N / tools / ucm2cp.c
1 /*
2  * ucm2cp.c - Convert IBM ".ucm" files into code page handling classes.
3  *
4  * Copyright (c) 2002  Southern Storm Software, Pty Ltd
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining
7  * a copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included
14  * in all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
17  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22  * OTHER DEALINGS IN THE SOFTWARE.
23  */
24
25 /*
26
27 Usage: ucm2cp [options] file
28
29         --region name                   I18N region name
30         --page num                              Code page number
31         --wpage num                             Windows code page number (optional)
32         --name str                              Human-readable encoding name
33         --webname str                   Web name of the encoding
34         --headername str                Header name of the encoding (optional)
35         --bodyname str                  Body name of the encoding (optional)
36         --no-browser-display    Set browser display value to false (optional)
37         --no-browser-save               Set browser save value to false (optional)
38         --no-mailnews-display   Set mail/news display value to false (optional)
39         --no-mailnews-save              Set mail/news save value to false (optional)
40
41 */
42
43 #include <stdio.h>
44 #include <string.h>
45 #include <stdlib.h>
46
/*
 * Settings collected from the command line.  Pointer-valued options
 * start out null and the code page numbers start out zero, which is
 * how "main" recognises an option that was never supplied.  The four
 * display/save flags start out enabled and are only ever cleared.
 */
static char *region = NULL;             /* --region */
static int codePage = 0;                /* --page */
static int windowsCodePage = 0;         /* --wpage */
static char *name = NULL;               /* --name */
static char *webName = NULL;            /* --webname */
static char *headerName = NULL;         /* --headername */
static char *bodyName = NULL;           /* --bodyname */
static int isBrowserDisplay = 1;        /* cleared by --no-browser-display */
static int isBrowserSave = 1;           /* cleared by --no-browser-save */
static int isMailNewsDisplay = 1;       /* cleared by --no-mailnews-display */
static int isMailNewsSave = 1;          /* cleared by --no-mailnews-save */
static const char *filename = NULL;     /* input file name, set by "main" */
62
/*
 * Prototypes for the routines that "main" calls but which are
 * defined further down the file, listed in the order they are called.
 */
static void usage(char *progname);
static void loadCharMaps(FILE *file);
static void printHeader(void);
static void printByteToChar(void);
static void printCharToByte(void);
static void printFooter(void);
72
73 int main(int argc, char *argv[])
74 {
75         char *progname = argv[0];
76         FILE *file;
77         int len;
78
79         /* Process the command-line options */
80         while(argc > 1 && argv[1][0] == '-')
81         {
82                 if(!strcmp(argv[1], "--page") && argc > 2)
83                 {
84                         codePage = atoi(argv[2]);
85                         ++argv;
86                         --argc;
87                 }
88                 else if(!strcmp(argv[1], "--wpage") && argc > 2)
89                 {
90                         windowsCodePage = atoi(argv[2]);
91                         ++argv;
92                         --argc;
93                 }
94                 else if(!strcmp(argv[1], "--region") && argc > 2)
95                 {
96                         region = argv[2];
97                         ++argv;
98                         --argc;
99                 }
100                 else if(!strcmp(argv[1], "--name") && argc > 2)
101                 {
102                         name = argv[2];
103                         ++argv;
104                         --argc;
105                 }
106                 else if(!strcmp(argv[1], "--webname") && argc > 2)
107                 {
108                         webName = argv[2];
109                         ++argv;
110                         --argc;
111                 }
112                 else if(!strcmp(argv[1], "--headername") && argc > 2)
113                 {
114                         headerName = argv[2];
115                         ++argv;
116                         --argc;
117                 }
118                 else if(!strcmp(argv[1], "--bodyname") && argc > 2)
119                 {
120                         bodyName = argv[2];
121                         ++argv;
122                         --argc;
123                 }
124                 else if(!strcmp(argv[1], "--no-browser-display"))
125                 {
126                         isBrowserDisplay = 0;
127                 }
128                 else if(!strcmp(argv[1], "--no-browser-save"))
129                 {
130                         isBrowserSave = 0;
131                 }
132                 else if(!strcmp(argv[1], "--no-mailnews-display"))
133                 {
134                         isMailNewsDisplay = 0;
135                 }
136                 else if(!strcmp(argv[1], "--no-mailnews-save"))
137                 {
138                         isMailNewsSave = 0;
139                 }
140                 ++argv;
141                 --argc;
142         }
143
144         /* Make sure that we have sufficient options */
145         if(!region || !codePage || !name || !webName || argc != 2)
146         {
147                 usage(progname);
148                 return 1;
149         }
150
151         /* Set defaults for unspecified options */
152         if(!headerName)
153         {
154                 headerName = webName;
155         }
156         if(!bodyName)
157         {
158                 bodyName = webName;
159         }
160         if(!windowsCodePage)
161         {
162                 windowsCodePage = codePage;
163         }
164
165         /* Open the UCM file */
166         file = fopen(argv[1], "r");
167         if(!file)
168         {
169                 perror(argv[1]);
170                 return 1;
171         }
172         filename = argv[1];
173         len = strlen(filename);
174         while(len > 0 && filename[len - 1] != '/' && filename[len - 1] != '\\')
175         {
176                 --len;
177         }
178         filename += len;
179
180         /* Load the character maps from the input file */
181         loadCharMaps(file);
182
183         /* Print the output header */
184         printHeader();
185
186         /* Print the byte->char conversion table */
187         printByteToChar();
188
189         /* Output the char->byte conversion methods */
190         printCharToByte();
191
192         /* Print the output footer */
193         printFooter();
194
195         /* Clean up and exit */
196         fclose(file);
197         return 0;
198 }
199
/*
 * Write the command-line synopsis and the option summary to stderr.
 * "progname" is substituted into the synopsis line.
 */
static void usage(char *progname)
{
	static const char optionSummary[] =
		"    --region name         I18N region name\n"
		"    --page num            Code page number\n"
		"    --wpage num           Windows code page number (optional)\n"
		"    --name str            Human-readable encoding name\n"
		"    --webname str         Web name of the encoding\n"
		"    --headername str      Header name of the encoding (optional)\n"
		"    --bodyname str        Body name of the encoding (optional)\n"
		"    --no-browser-display  Set browser display value to false (optional)\n"
		"    --no-browser-save     Set browser save value to false (optional)\n"
		"    --no-mailnews-display Set mail/news display value to false (optional)\n"
		"    --no-mailnews-save    Set mail/news save value to false (optional)\n";

	fprintf(stderr, "Usage: %s [options] file\n\n", progname);
	fputs(optionSummary, stderr);
}
215
/*
 * Byte -> character table, indexed by byte value.  The parallel
 * "byteToCharLevel" array records the level of the mapping currently
 * held for each byte, so that a better candidate can replace it when
 * several characters map to the same byte.
 */
static unsigned int byteToChar[256];
static int byteToCharLevel[256];

/*
 * Character -> byte table, indexed by 16-bit character value.
 */
static int charToByte[65536];
227
/*
 * Parse a hexadecimal value from the start of "buf".  Digits may be
 * upper or lower case; parsing stops at the first character that is
 * not a hexadecimal digit.
 *
 * The parsed number is stored in "*value" (0 when there are no digits).
 * A number too large for "unsigned long" is stored as the largest
 * "unsigned long" value rather than wrapping around, so that callers
 * which range-check the result cannot be fooled by an over-long
 * digit string.
 *
 * Returns the number of digit characters consumed.
 */
static int parseHex(const char *buf, unsigned long *value)
{
	const unsigned long maxValue = ~0UL;
	unsigned long digit;
	int len = 0;
	char ch;

	*value = 0;
	while((ch = buf[len]) != '\0')
	{
		if(ch >= '0' && ch <= '9')
		{
			digit = (unsigned long)(ch - '0');
		}
		else if(ch >= 'A' && ch <= 'F')
		{
			digit = (unsigned long)(ch - 'A' + 10);
		}
		else if(ch >= 'a' && ch <= 'f')
		{
			digit = (unsigned long)(ch - 'a' + 10);
		}
		else
		{
			break;
		}

		/* Saturate instead of overflowing.  Once the maximum has been
		   reached this test stays true, so the value stays saturated */
		if(*value > (maxValue - digit) / 16)
		{
			*value = maxValue;
		}
		else
		{
			*value = *value * 16 + digit;
		}
		++len;
	}
	return len;
}
259
260 /*
261  * Load the character mapping information from a UCM file.
262  */
263 static void loadCharMaps(FILE *file)
264 {
265         unsigned long posn;
266         unsigned long byteValue;
267         int level;
268         char buffer[BUFSIZ];
269         const char *buf;
270
271         /* Initialize the mapping tables */
272         for(posn = 0; posn < 256; ++posn)
273         {
274                 byteToChar[posn] = (unsigned)'?';
275                 byteToCharLevel[posn] = 100;
276         }
277         for(posn = 0; posn < 65536; ++posn)
278         {
279                 charToByte[posn] = -1;
280         }
281
282         /* Read the contents of the file */
283         while(fgets(buffer, BUFSIZ, file))
284         {
285                 /* Lines of interest begin with "<U" */
286                 if(buffer[0] != '<' || buffer[1] != 'U')
287                 {
288                         continue;
289                 }
290
291                 /* Parse the fields on the line */
292                 buf = buffer + 2;
293                 buf += parseHex(buf, &posn);
294                 if(posn >= 65536)
295                 {
296                         continue;
297                 }
298                 while(*buf != '\0' && *buf != '\\')
299                 {
300                         ++buf;
301                 }
302                 if(*buf != '\\' || buf[1] != 'x')
303                 {
304                         continue;
305                 }
306                 buf += 2;
307                 buf += parseHex(buf, &byteValue);
308                 if(byteValue >= 256)
309                 {
310                         continue;
311                 }
312                 while(*buf != '\0' && *buf != '|')
313                 {
314                         ++buf;
315                 }
316                 if(*buf != '|')
317                 {
318                         continue;
319                 }
320                 level = (int)(buf[1] - '0');
321
322                 /* Update the byte->char mapping table */
323                 if(level < byteToCharLevel[byteValue])
324                 {
325                         byteToCharLevel[byteValue] = level;
326                         byteToChar[byteValue] = (unsigned)posn;
327                 }
328
329                 /* Update the char->byte mapping table */
330                 charToByte[posn] = (int)byteValue;
331         }
332 }
333
/*
 * Remainder of the comment block that opens every generated file: the
 * copyright notice and license text, the closing comment delimiter,
 * and one blank line.  The opening delimiter and the title line are
 * written separately by "printHeader".
 */
#define COPYRIGHT_MSG                                                                   \
	" *\n"                                                                              \
	" * Copyright (c) 2002  Southern Storm Software, Pty Ltd\n"                         \
	" *\n"                                                                              \
	" * Permission is hereby granted, free of charge, to any person obtaining\n"        \
	" * a copy of this software and associated documentation files (the \"Software\"),\n" \
	" * to deal in the Software without restriction, including without limitation\n"    \
	" * the rights to use, copy, modify, merge, publish, distribute, sublicense,\n"     \
	" * and/or sell copies of the Software, and to permit persons to whom the\n"        \
	" * Software is furnished to do so, subject to the following conditions:\n"         \
	" *\n"                                                                              \
	" * The above copyright notice and this permission notice shall be included\n"      \
	" * in all copies or substantial portions of the Software.\n"                       \
	" *\n"                                                                              \
	" * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n"    \
	" * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n"  \
	" * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL\n"      \
	" * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR\n"         \
	" * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,\n"        \
	" * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR\n"        \
	" * OTHER DEALINGS IN THE SOFTWARE.\n"                                              \
	" */\n\n"
356
357 /*
358  * Print the header for the current code page definition.
359  */
360 static void printHeader(void)
361 {
362         printf("/*\n * CP%d.cs - %s code page.\n", codePage, name);
363         fputs(COPYRIGHT_MSG, stdout);
364         printf("// Generated from \"%s\".\n\n", filename);
365         printf("namespace I18N.%s\n{\n\n", region);
366         printf("using System;\n");
367         printf("using I18N.Common;\n\n");
368         printf("public class CP%d : ByteEncoding\n{\n", codePage);
369         printf("\tpublic CP%d()\n", codePage);
370         printf("\t\t: base(%d, ToChars, \"%s\",\n", codePage, name);
371         printf("\t\t       \"%s\", \"%s\", \"%s\",\n",
372                bodyName, headerName, webName);
373         printf("\t\t       %s, %s, %s, %s, %d)\n",
374                    (isBrowserDisplay ? "true" : "false"),
375                    (isBrowserSave ? "true" : "false"),
376                    (isMailNewsDisplay ? "true" : "false"),
377                    (isMailNewsSave ? "true" : "false"),
378                    windowsCodePage);
379         printf("\t{}\n\n");
380 }
381
/*
 * Write "name" to stdout in a form usable inside a type name: the
 * letters 'A'..'Z' are lower-cased, each '-' becomes '_', and every
 * other character is written unchanged.
 */
static void printEncodingName(const char *name)
{
	const char *cursor;

	for(cursor = name; *cursor != '\0'; ++cursor)
	{
		int out = *cursor;

		if(out >= 'A' && out <= 'Z')
		{
			out += 'a' - 'A';
		}
		else if(out == '-')
		{
			out = '_';
		}
		putc(out, stdout);
	}
}
404
405 /*
406  * Print the footer for the current code page definition.
407  */
408 static void printFooter(void)
409 {
410         printf("}; // class CP%d\n\n", codePage);
411         printf("public class ENC");
412         printEncodingName(webName);
413         printf(" : CP%d\n{\n", codePage);
414         printf("\tpublic ENC");
415         printEncodingName(webName);
416         printf("() : base() {}\n\n");
417         printf("}; // class ENC");
418         printEncodingName(webName);
419         printf("\n\n}; // namespace I18N.%s\n", region);
420 }
421
422 /*
423  * Print the byte->char conversion table.
424  */
425 static void printByteToChar(void)
426 {
427         int posn;
428         printf("\tprivate static readonly char[] ToChars = {");
429         for(posn = 0; posn < 256; ++posn)
430         {
431                 if((posn % 6) == 0)
432                 {
433                         printf("\n\t\t");
434                 }
435                 printf("'\\u%04X', ", byteToChar[posn]);
436         }
437         printf("\n\t};\n\n");
438 }
439
440 /*
441  * Print a "switch" statement that converts "ch" from
442  * a character value into a byte value.
443  */
444 static void printConvertSwitch(void)
445 {
446         unsigned long directLimit;
447         unsigned long posn;
448         unsigned long posn2;
449         unsigned long rangeSize;
450         int haveDirect;
451         int haveFullWidth;
452
453         /* Find the limit of direct byte mappings */
454         directLimit = 0;
455         while(directLimit < 256 && charToByte[directLimit] == (int)directLimit)
456         {
457                 ++directLimit;
458         }
459
460         /* Determine if we have the full-width Latin1 mappings, which
461            we can optimise in the default case of the switch */
462         haveFullWidth = 1;
463         for(posn = 0xFF01; posn <= 0xFF5E; ++posn)
464         {
465                 if((charToByte[posn] - 0x21) != (int)(posn - 0xFF01))
466                 {
467                         haveFullWidth = 0;
468                 }
469         }
470
471         /* Print the switch header.  The "if" is an optimisation
472            to ignore the common case of direct ASCII mappings */
473         printf("\t\t\tif(ch >= %lu) switch(ch)\n", directLimit);
474         printf("\t\t\t{\n");
475
476         /* Handle all direct byte mappings above the direct limit */
477         haveDirect = 0;
478         for(posn = directLimit; posn < 256; ++posn)
479         {
480                 if(charToByte[posn] == (int)posn)
481                 {
482                         haveDirect = 1;
483                         printf("\t\t\t\tcase 0x%04lX:\n", posn);
484                 }
485         }
486         if(haveDirect)
487         {
488                 printf("\t\t\t\t\tbreak;\n");
489         }
490
491         /* Handle the indirect mappings */
492         for(posn = 0; posn < 65536; ++posn)
493         {
494                 if(haveFullWidth && posn >= 0xFF01 && posn <= 0xFF5E)
495                 {
496                         /* Handle full-width Latin1 conversions later */
497                         continue;
498                 }
499                 if(charToByte[posn] != (int)posn &&
500                    charToByte[posn] != -1)
501                 {
502                         /* See if we have a run of 4 or more characters that
503                            can be mapped algorithmically to some other range */
504                         rangeSize = 1;
505                         for(posn2 = posn + 1; posn2 < 65536; ++posn2)
506                         {
507                                 if(charToByte[posn2] == (int)posn2 ||
508                                    charToByte[posn2] == -1)
509                                 {
510                                         break;
511                                 }
512                                 if((charToByte[posn2] - charToByte[posn]) !=
513                                    (int)(posn2 - posn))
514                                 {
515                                         break;
516                                 }
517                                 ++rangeSize;
518                         }
519                         if(rangeSize >= 4)
520                         {
521                                 /* Output a range mapping for the characters */
522                                 for(posn2 = posn; posn2 < (posn + rangeSize); ++posn2)
523                                 {
524                                         printf("\t\t\t\tcase 0x%04lX:\n", posn2);
525                                 }
526                                 posn += rangeSize - 1;
527                                 if(((long)posn) >= (long)(charToByte[posn]))
528                                 {
529                                         printf("\t\t\t\t\tch -= 0x%04lX;\n",
530                                                    (long)(posn - charToByte[posn]));
531                                 }
532                                 else
533                                 {
534                                         printf("\t\t\t\t\tch += 0x%04lX;\n",
535                                                    (long)(charToByte[posn] - posn));
536                                 }
537                                 printf("\t\t\t\t\tbreak;\n");
538                         }
539                         else
540                         {
541                                 /* Use a simple non-algorithmic mapping */
542                                 printf("\t\t\t\tcase 0x%04lX: ch = 0x%02X; break;\n",
543                                            posn, (unsigned)(charToByte[posn]));
544                         }
545                 }
546         }
547
548         /* Print the switch footer */
549         if(!haveFullWidth)
550         {
551                 printf("\t\t\t\tdefault: ch = 0x3F; break;\n");
552         }
553         else
554         {
555                 printf("\t\t\t\tdefault:\n");
556                 printf("\t\t\t\t{\n");
557                 printf("\t\t\t\t\tif(ch >= 0xFF01 && ch <= 0xFF5E)\n");
558                 printf("\t\t\t\t\t\tch -= 0xFEE0;\n");
559                 printf("\t\t\t\t\telse\n");
560                 printf("\t\t\t\t\t\tch = 0x3F;\n");
561                 printf("\t\t\t\t}\n");
562                 printf("\t\t\t\tbreak;\n");
563         }
564         printf("\t\t\t}\n");
565 }
566
/*
 * Write one generated "ToBytes" override to stdout.  "sourceParam" is
 * the declaration of the first parameter and "sourceName" is the name
 * used to index it; the rest of the method, including the conversion
 * switch, is the same for every overload.
 */
static void printToBytesMethod(const char *sourceParam, const char *sourceName)
{
	/* Signature and loop prologue */
	printf("\tprotected override void ToBytes(%s, int charIndex, int charCount,\n",
		   sourceParam);
	fputs("\t                                byte[] bytes, int byteIndex)\n"
		  "\t{\n"
		  "\t\tint ch;\n"
		  "\t\twhile(charCount > 0)\n"
		  "\t\t{\n", stdout);

	/* Fetch the next character, convert it, and store the byte */
	printf("\t\t\tch = (int)(%s[charIndex++]);\n", sourceName);
	printConvertSwitch();
	fputs("\t\t\tbytes[byteIndex++] = (byte)ch;\n"
		  "\t\t\t--charCount;\n"
		  "\t\t}\n"
		  "\t}\n\n", stdout);
}

/*
 * Print the char->byte conversion methods: one overload that reads
 * from a character array, followed by one that reads from a string.
 */
static void printCharToByte(void)
{
	/* Conversion method for character buffers */
	printToBytesMethod("char[] chars", "chars");

	/* Conversion method for string buffers */
	printToBytesMethod("String s", "s");
}