New test.
[mono.git] / mcs / class / I18N / tools / ucm2cp.c
1 /*
2  * ucm2cp.c - Convert IBM ".ucm" files or hexadecimal mapping ".TXT" files
3  * into code page handling classes.
4  *
5  * Copyright (c) 2002  Southern Storm Software, Pty Ltd
6  * Copyright (c) 2006  Bruno Haible
7  *
8  * Permission is hereby granted, free of charge, to any person obtaining
9  * a copy of this software and associated documentation files (the "Software"),
10  * to deal in the Software without restriction, including without limitation
11  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
12  * and/or sell copies of the Software, and to permit persons to whom the
13  * Software is furnished to do so, subject to the following conditions:
14  *
15  * The above copyright notice and this permission notice shall be included
16  * in all copies or substantial portions of the Software.
17  *
18  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
19  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
21  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
22  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
23  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
24  * OTHER DEALINGS IN THE SOFTWARE.
25  */
26
27 /*
28
29 Usage: ucm2cp [options] file
30
31         --region name                   I18N region name
32         --page num                              Code page number
33         --wpage num                             Windows code page number (optional)
34         --name str                              Human-readable encoding name
35         --webname str                   Web name of the encoding
36         --headername str                Header name of the encoding (optional)
37         --bodyname str                  Body name of the encoding (optional)
38         --no-browser-display    Set browser display value to false (optional)
39         --no-browser-save               Set browser save value to false (optional)
40         --no-mailnews-display   Set mail/news display value to false (optional)
41         --no-mailnews-save              Set mail/news save value to false (optional)
42
43 */
44
45 #include <stdio.h>
46 #include <string.h>
47 #include <stdlib.h>
48
/*
 * Settings collected from the command line by main().  String settings
 * are null and numeric settings are zero until the matching option is
 * seen (objects with static storage start out zeroed); the four
 * display/save flags start out enabled and are cleared by "--no-*".
 */
static char *region;                 /* --region */
static int codePage;                 /* --page */
static int windowsCodePage;          /* --wpage */
static char *name;                   /* --name */
static char *webName;                /* --webname */
static char *headerName;             /* --headername */
static char *bodyName;               /* --bodyname */
static int isBrowserDisplay = 1;     /* --no-browser-display clears it */
static int isBrowserSave = 1;        /* --no-browser-save clears it */
static int isMailNewsDisplay = 1;    /* --no-mailnews-display clears it */
static int isMailNewsSave = 1;       /* --no-mailnews-save clears it */
static const char *filename;         /* input file name, directories removed */
64
/*
 * Prototypes for the helpers main() uses before they are defined,
 * listed in the order main() calls them.
 */
static void usage(char *progname);      /* option summary on stderr */
static void loadCharMaps(FILE *file);   /* fill the mapping tables */
static void printHeader(void);          /* banner, namespace, constructor */
static void printByteToChar(void);      /* byte -> char table */
static void printCharToByte(void);      /* char -> byte methods */
static void printFooter(void);          /* closing braces, ENC* class */
74
75 int main(int argc, char *argv[])
76 {
77         char *progname = argv[0];
78         FILE *file;
79         int len;
80
81         /* Process the command-line options */
82         while(argc > 1 && argv[1][0] == '-')
83         {
84                 if(!strcmp(argv[1], "--page") && argc > 2)
85                 {
86                         codePage = atoi(argv[2]);
87                         ++argv;
88                         --argc;
89                 }
90                 else if(!strcmp(argv[1], "--wpage") && argc > 2)
91                 {
92                         windowsCodePage = atoi(argv[2]);
93                         ++argv;
94                         --argc;
95                 }
96                 else if(!strcmp(argv[1], "--region") && argc > 2)
97                 {
98                         region = argv[2];
99                         ++argv;
100                         --argc;
101                 }
102                 else if(!strcmp(argv[1], "--name") && argc > 2)
103                 {
104                         name = argv[2];
105                         ++argv;
106                         --argc;
107                 }
108                 else if(!strcmp(argv[1], "--webname") && argc > 2)
109                 {
110                         webName = argv[2];
111                         ++argv;
112                         --argc;
113                 }
114                 else if(!strcmp(argv[1], "--headername") && argc > 2)
115                 {
116                         headerName = argv[2];
117                         ++argv;
118                         --argc;
119                 }
120                 else if(!strcmp(argv[1], "--bodyname") && argc > 2)
121                 {
122                         bodyName = argv[2];
123                         ++argv;
124                         --argc;
125                 }
126                 else if(!strcmp(argv[1], "--no-browser-display"))
127                 {
128                         isBrowserDisplay = 0;
129                 }
130                 else if(!strcmp(argv[1], "--no-browser-save"))
131                 {
132                         isBrowserSave = 0;
133                 }
134                 else if(!strcmp(argv[1], "--no-mailnews-display"))
135                 {
136                         isMailNewsDisplay = 0;
137                 }
138                 else if(!strcmp(argv[1], "--no-mailnews-save"))
139                 {
140                         isMailNewsSave = 0;
141                 }
142                 ++argv;
143                 --argc;
144         }
145
146         /* Make sure that we have sufficient options */
147         if(!region || !codePage || !name || !webName || argc != 2)
148         {
149                 usage(progname);
150                 return 1;
151         }
152
153         /* Set defaults for unspecified options */
154         if(!headerName)
155         {
156                 headerName = webName;
157         }
158         if(!bodyName)
159         {
160                 bodyName = webName;
161         }
162         if(!windowsCodePage)
163         {
164                 windowsCodePage = codePage;
165         }
166
167         /* Open the UCM or TXT file */
168         file = fopen(argv[1], "r");
169         if(!file)
170         {
171                 perror(argv[1]);
172                 return 1;
173         }
174         filename = argv[1];
175         len = strlen(filename);
176         while(len > 0 && filename[len - 1] != '/' && filename[len - 1] != '\\')
177         {
178                 --len;
179         }
180         filename += len;
181
182         /* Load the character maps from the input file */
183         loadCharMaps(file);
184
185         /* Print the output header */
186         printHeader();
187
188         /* Print the byte->char conversion table */
189         printByteToChar();
190
191         /* Output the char->byte conversion methods */
192         printCharToByte();
193
194         /* Print the output footer */
195         printFooter();
196
197         /* Clean up and exit */
198         fclose(file);
199         return 0;
200 }
201
/*
 * Write the usage summary to stderr: one "Usage:" line naming the
 * program as "progname", a blank line, then one line per option.
 */
static void usage(char *progname)
{
	static const char *const optionHelp[] = {
		"    --region name         I18N region name\n",
		"    --page num            Code page number\n",
		"    --wpage num           Windows code page number (optional)\n",
		"    --name str            Human-readable encoding name\n",
		"    --webname str         Web name of the encoding\n",
		"    --headername str      Header name of the encoding (optional)\n",
		"    --bodyname str        Body name of the encoding (optional)\n",
		"    --no-browser-display  Set browser display value to false (optional)\n",
		"    --no-browser-save     Set browser save value to false (optional)\n",
		"    --no-mailnews-display Set mail/news display value to false (optional)\n",
		"    --no-mailnews-save    Set mail/news save value to false (optional)\n"
	};
	size_t index;

	fprintf(stderr, "Usage: %s [options] file\n\n", progname);
	for(index = 0; index < sizeof(optionHelp) / sizeof(optionHelp[0]); ++index)
	{
		fputs(optionHelp[index], stderr);
	}
}
217
/*
 * byteToChar[b] is the character chosen for byte value "b", and
 * byteToCharLevel[b] is the level of the entry that supplied it.  When
 * several input entries name the same byte, loadCharMaps() keeps the
 * one with the lowest level.
 */
static unsigned int byteToChar[256];
static int byteToCharLevel[256];

/*
 * charToByte[c] is the byte value for 16-bit character "c";
 * loadCharMaps() stores -1 for characters that have no mapping.
 */
static int charToByte[65536];
229
/*
 * Parse a run of hexadecimal digits (either case) at the start of "buf".
 *
 * Stores the parsed number in "*value" and returns how many characters
 * were consumed; when "buf" does not start with a hex digit the return
 * value is 0 and "*value" is 0.
 *
 * A run too long for "unsigned long" saturates at the largest
 * representable value instead of wrapping around, so an overlong number
 * can never pass itself off as a small, valid-looking one in the
 * callers' range checks.
 */
static int parseHex(const char *buf, unsigned long *value)
{
	const unsigned long maxValue = ~0UL;
	unsigned long digit;
	int len = 0;
	char ch;

	*value = 0;
	for(;;)
	{
		ch = buf[len];
		if(ch >= '0' && ch <= '9')
		{
			digit = (unsigned long)(ch - '0');
		}
		else if(ch >= 'A' && ch <= 'F')
		{
			digit = (unsigned long)(ch - 'A' + 10);
		}
		else if(ch >= 'a' && ch <= 'f')
		{
			digit = (unsigned long)(ch - 'a' + 10);
		}
		else
		{
			/* Any other character, including the terminator, ends the run */
			break;
		}

		if(*value > (maxValue >> 4))
		{
			/* Another digit would not fit: stay pinned at the maximum */
			*value = maxValue;
		}
		else
		{
			*value = (*value << 4) | digit;
		}
		++len;
	}
	return len;
}
261
262 /*
263  * Load the character mapping information from a UCM or TXT file.
264  */
265 static void loadCharMaps(FILE *file)
266 {
267         enum { unknown, ucm, txt } syntax;
268         unsigned long posn;
269         unsigned long byteValue;
270         int level;
271         char buffer[BUFSIZ];
272         const char *buf;
273
274         /* Initialize the mapping tables */
275         for(posn = 0; posn < 256; ++posn)
276         {
277                 byteToChar[posn] = (unsigned)'?';
278                 byteToCharLevel[posn] = 100;
279         }
280         for(posn = 0; posn < 65536; ++posn)
281         {
282                 charToByte[posn] = -1;
283         }
284
285         syntax = unknown;
286
287         /* Read the contents of the file */
288         while(fgets(buffer, BUFSIZ, file))
289         {
290                 /* Syntax recognition */
291                 if (syntax == unknown)
292                 {
293                         if (memcmp(buffer, "CHARMAP", 7) == 0)
294                                 syntax = ucm;
295                         else if (memcmp(buffer, "0x", 2) == 0)
296                                 syntax = txt;
297                 }
298
299                 if (syntax == ucm)
300                 {
301                         /* Lines of interest begin with "<U" */
302                         if(buffer[0] != '<' || buffer[1] != 'U')
303                         {
304                                 continue;
305                         }
306
307                         /* Parse the fields on the line */
308                         buf = buffer + 2;
309                         buf += parseHex(buf, &posn);
310                         if(posn >= 65536)
311                         {
312                                 continue;
313                         }
314                         while(*buf != '\0' && *buf != '\\')
315                         {
316                                 ++buf;
317                         }
318                         if(*buf != '\\' || buf[1] != 'x')
319                         {
320                                 continue;
321                         }
322                         buf += 2;
323                         buf += parseHex(buf, &byteValue);
324                         if(byteValue >= 256)
325                         {
326                                 continue;
327                         }
328                         while(*buf != '\0' && *buf != '|')
329                         {
330                                 ++buf;
331                         }
332                         if(*buf != '|')
333                         {
334                                 continue;
335                         }
336                         level = (int)(buf[1] - '0');
337                 }
338                 else
339                 if (syntax == txt)
340                 {
341                         unsigned int x;
342                         int cnt;
343
344                         /* Lines of interest begin with "0x" */
345                         if(buffer[0] != '0' || buffer[1] != 'x')
346                                 continue;
347
348                         /* Parse the fields on the line */
349                         if(sscanf(buffer, "0x%x%n", &x, &cnt) <= 0)
350                                 exit(1);
351                         if(!(x < 0x100))
352                                 exit(1);
353                         byteValue = x;
354                         while (buffer[cnt] == ' ' || buffer[cnt] == '\t')
355                                 cnt++;
356                         if(sscanf(buffer+cnt, "0x%x", &x) != 1)
357                                 continue;
358                         if(!(x < 0x10000))
359                                 exit(1);
360                         posn = x;
361                         level = 0;
362                 }
363                 else
364                         continue;
365
366                 /* Update the byte->char mapping table */
367                 if(level < byteToCharLevel[byteValue])
368                 {
369                         byteToCharLevel[byteValue] = level;
370                         byteToChar[byteValue] = (unsigned)posn;
371                 }
372
373                 /* Update the char->byte mapping table */
374                 charToByte[posn] = (int)byteValue;
375         }
376 }
377
378 #define COPYRIGHT_MSG \
379 " *\n" \
380 " * Copyright (c) 2002  Southern Storm Software, Pty Ltd\n" \
381 " *\n" \
382 " * Permission is hereby granted, free of charge, to any person obtaining\n" \
383 " * a copy of this software and associated documentation files (the \"Software\"),\n" \
384 " * to deal in the Software without restriction, including without limitation\n" \
385 " * the rights to use, copy, modify, merge, publish, distribute, sublicense,\n" \
386 " * and/or sell copies of the Software, and to permit persons to whom the\n" \
387 " * Software is furnished to do so, subject to the following conditions:\n" \
388 " *\n" \
389 " * The above copyright notice and this permission notice shall be included\n" \
390 " * in all copies or substantial portions of the Software.\n" \
391 " *\n" \
392 " * THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS\n" \
393 " * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n" \
394 " * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL\n" \
395 " * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR\n" \
396 " * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,\n" \
397 " * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR\n" \
398 " * OTHER DEALINGS IN THE SOFTWARE.\n" \
399 " */\n\n"
400
401 /*
402  * Print the header for the current code page definition.
403  */
404 static void printHeader(void)
405 {
406         printf("/*\n * CP%d.cs - %s code page.\n", codePage, name);
407         fputs(COPYRIGHT_MSG, stdout);
408         printf("// Generated from \"%s\".\n\n", filename);
409         printf("namespace I18N.%s\n{\n\n", region);
410         printf("using System;\n");
411         printf("using System.Text;\n");
412         printf("using I18N.Common;\n\n");
413         printf("[Serializable]\n");
414         printf("public class CP%d : ByteEncoding\n{\n", codePage);
415         printf("\tpublic CP%d()\n", codePage);
416         printf("\t\t: base(%d, ToChars, \"%s\",\n", codePage, name);
417         printf("\t\t       \"%s\", \"%s\", \"%s\",\n",
418                bodyName, headerName, webName);
419         printf("\t\t       %s, %s, %s, %s, %d)\n",
420                    (isBrowserDisplay ? "true" : "false"),
421                    (isBrowserSave ? "true" : "false"),
422                    (isMailNewsDisplay ? "true" : "false"),
423                    (isMailNewsSave ? "true" : "false"),
424                    windowsCodePage);
425         printf("\t{}\n\n");
426 }
427
/*
 * Write "name" to stdout in the form used inside generated type names:
 * the letters 'A'..'Z' become lower case, every '-' becomes '_', and
 * all other characters are written unchanged.
 */
static void printEncodingName(const char *name)
{
	const char *cursor;
	int out;

	for(cursor = name; *cursor != '\0'; ++cursor)
	{
		out = *cursor;
		if(out >= 'A' && out <= 'Z')
		{
			out += 'a' - 'A';
		}
		else if(out == '-')
		{
			out = '_';
		}
		putc(out, stdout);
	}
}
450
451 /*
452  * Print the footer for the current code page definition.
453  */
454 static void printFooter(void)
455 {
456         printf("}; // class CP%d\n\n", codePage);
457         printf("[Serializable]\n");
458         printf("public class ENC");
459         printEncodingName(webName);
460         printf(" : CP%d\n{\n", codePage);
461         printf("\tpublic ENC");
462         printEncodingName(webName);
463         printf("() : base() {}\n\n");
464         printf("}; // class ENC");
465         printEncodingName(webName);
466         printf("\n\n}; // namespace I18N.%s\n", region);
467 }
468
469 /*
470  * Print the byte->char conversion table.
471  */
472 static void printByteToChar(void)
473 {
474         int posn;
475         printf("\tprivate static readonly char[] ToChars = {");
476         for(posn = 0; posn < 256; ++posn)
477         {
478                 if((posn % 6) == 0)
479                 {
480                         printf("\n\t\t");
481                 }
482                 printf("'\\u%04X', ", byteToChar[posn]);
483         }
484         printf("\n\t};\n\n");
485 }
486
487 /*
488  * Print a "switch" statement that converts "ch" from
489  * a character value into a byte value.
490  */
491 static void printConvertSwitch(int forString)
492 {
493         unsigned long directLimit;
494         unsigned long posn;
495         unsigned long posn2;
496         unsigned long rangeSize;
497         int haveDirect;
498         int haveFullWidth;
499
500         /* Find the limit of direct byte mappings */
501         directLimit = 0;
502         while(directLimit < 256 && charToByte[directLimit] == (int)directLimit)
503         {
504                 ++directLimit;
505         }
506
507         /* Determine if we have the full-width Latin1 mappings, which
508            we can optimise in the default case of the switch */
509         haveFullWidth = 1;
510         for(posn = 0xFF01; posn <= 0xFF5E; ++posn)
511         {
512                 if((charToByte[posn] - 0x21) != (int)(posn - 0xFF01))
513                 {
514                         haveFullWidth = 0;
515                 }
516         }
517
518         /* Print the switch header.  The "if" is an optimisation
519            to ignore the common case of direct ASCII mappings */
520         printf("\t\t\tif(ch >= %lu) switch(ch)\n", directLimit);
521         printf("\t\t\t{\n");
522
523         /* Handle all direct byte mappings above the direct limit */
524         haveDirect = 0;
525         for(posn = directLimit; posn < 256; ++posn)
526         {
527                 if(charToByte[posn] == (int)posn)
528                 {
529                         haveDirect = 1;
530                         printf("\t\t\t\tcase 0x%04lX:\n", posn);
531                 }
532         }
533         if(haveDirect)
534         {
535                 printf("\t\t\t\t\tbreak;\n");
536         }
537
538         /* Handle the indirect mappings */
539         for(posn = 0; posn < 65536; ++posn)
540         {
541                 if(haveFullWidth && posn >= 0xFF01 && posn <= 0xFF5E)
542                 {
543                         /* Handle full-width Latin1 conversions later */
544                         continue;
545                 }
546                 if(charToByte[posn] != (int)posn &&
547                    charToByte[posn] != -1)
548                 {
549                         /* See if we have a run of 4 or more characters that
550                            can be mapped algorithmically to some other range */
551                         rangeSize = 1;
552                         for(posn2 = posn + 1; posn2 < 65536; ++posn2)
553                         {
554                                 if(charToByte[posn2] == (int)posn2 ||
555                                    charToByte[posn2] == -1)
556                                 {
557                                         break;
558                                 }
559                                 if((charToByte[posn2] - charToByte[posn]) !=
560                                    (int)(posn2 - posn))
561                                 {
562                                         break;
563                                 }
564                                 ++rangeSize;
565                         }
566                         if(rangeSize >= 4)
567                         {
568                                 /* Output a range mapping for the characters */
569                                 for(posn2 = posn; posn2 < (posn + rangeSize); ++posn2)
570                                 {
571                                         printf("\t\t\t\tcase 0x%04lX:\n", posn2);
572                                 }
573                                 posn += rangeSize - 1;
574                                 if(((long)posn) >= (long)(charToByte[posn]))
575                                 {
576                                         printf("\t\t\t\t\tch -= 0x%04lX;\n",
577                                                    (long)(posn - charToByte[posn]));
578                                 }
579                                 else
580                                 {
581                                         printf("\t\t\t\t\tch += 0x%04lX;\n",
582                                                    (long)(charToByte[posn] - posn));
583                                 }
584                                 printf("\t\t\t\t\tbreak;\n");
585                         }
586                         else
587                         {
588                                 /* Use a simple non-algorithmic mapping */
589                                 printf("\t\t\t\tcase 0x%04lX: ch = 0x%02X; break;\n",
590                                            posn, (unsigned)(charToByte[posn]));
591                         }
592                 }
593         }
594
595         /* Print the switch footer */
596         if(!haveFullWidth)
597         {
598                 if(forString)
599                         printf("\t\t\t\tdefault: ch = 0x3F; break;\n");
600                 else {
601                         printf("\t\t\t\tdefault:\n");
602                         printf("#if NET_2_0\n");
603                         printf("\t\t\t\t\tHandleFallback (ref buffer, chars, ref charIndex, ref charCount, bytes, ref byteIndex, ref byteCount);\n");
604                         printf("#else\n");
605                         printf("\t\t\t\t\t\tch = 0x3F;\n");
606                         printf("#endif\n");
607                         printf("\t\t\t\t\tbreak;\n");
608                 }
609         }
610         else
611         {
612                 printf("\t\t\t\tdefault:\n");
613                 printf("\t\t\t\t{\n");
614                 printf("\t\t\t\t\tif(ch >= 0xFF01 && ch <= 0xFF5E)\n");
615                 printf("\t\t\t\t\t\tch -= 0xFEE0;\n");
616                 printf("\t\t\t\t\telse\n");
617                 if(forString) /* this is basically meaningless, just to make diff for unused code minimum */
618                         printf("\t\t\t\t\t\tch = 0x3F;\n");
619                 else {
620                         printf("#if NET_2_0\n");
621                         printf("\t\t\t\t\t\tHandleFallback (ref buffer, chars, ref charIndex, ref charCount, bytes, ref byteIndex, ref byteCount);\n");
622                         printf("#else\n");
623                         printf("\t\t\t\t\t\tch = 0x3F;\n");
624                         printf("#endif\n");
625                 }
626                 printf("\t\t\t\t}\n");
627                 printf("\t\t\t\tbreak;\n");
628         }
629         printf("\t\t\t}\n");
630 }
631
/*
 * Write the two generated ToBytes methods to stdout.  The first, for
 * character buffers, is live code; the second, for strings, is written
 * inside a C# comment.  Each wraps the switch produced by
 * printConvertSwitch() in a loop over the input characters.
 */
static void printCharToByte(void)
{
	/* Conversion method for character buffers */
	fputs("\tprotected unsafe override void ToBytes(char* chars, int charCount,\n"
		  "\t                                byte* bytes, int byteCount)\n"
		  "\t{\n"
		  "\t\tint ch;\n"
		  "\t\tint charIndex = 0;\n"
		  "\t\tint byteIndex = 0;\n"
		  "#if NET_2_0\n"
		  "\t\tEncoderFallbackBuffer buffer = null;\n"
		  "#endif\n"
		  "\t\twhile(charCount > 0)\n"
		  "\t\t{\n"
		  "\t\t\tch = (int)(chars[charIndex++]);\n",
		  stdout);
	printConvertSwitch(0);
	fputs("\t\t\tbytes[byteIndex++] = (byte)ch;\n"
		  "\t\t\t--charCount;\n"
		  "\t\t\t--byteCount;\n"
		  "\t\t}\n"
		  "\t}\n\n",
		  stdout);

	/* Conversion method for string buffers, emitted commented out */
	fputs("\t/*\n"
		  "\tprotected override void ToBytes(String s, int charIndex, int charCount,\n"
		  "\t                                byte[] bytes, int byteIndex)\n"
		  "\t{\n"
		  "\t\tint ch;\n"
		  "\t\twhile(charCount > 0)\n"
		  "\t\t{\n"
		  "\t\t\tch = (int)(s[charIndex++]);\n",
		  stdout);
	printConvertSwitch(1);
	fputs("\t\t\tbytes[byteIndex++] = (byte)ch;\n"
		  "\t\t\t--charCount;\n"
		  "\t\t}\n"
		  "\t}\n"
		  "\t*/\n\n",
		  stdout);
}