2 * uni2tab.c - Convert Unicode data files into CJK conversion tables.
4 * Copyright (c) 2002 Southern Storm Software, Pty Ltd
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included
14 * in all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
17 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
29 Required files from ftp.unicode.org: Unihan.txt, shiftjis.txt
38 * Forward declarations.
/* Prototypes for the helpers that main calls before their definitions
   appear further down in the file. */
40 static void convertLine(char *buf);
41 static void convertSJISLine(char *buf);
42 static int createTables(void);
/*
 * Program entry point: feeds selected lines of "Unihan.txt" and
 * "shiftjis.txt" to the line converters, then calls createTables.
 *
 * NOTE(review): the numbering embedded in these lines skips values, so the
 * local declarations, braces, fclose calls and return statements of this
 * function are not visible here - confirm against the complete file.
 */
44 int main(int argc, char *argv[])
50 /* Load the relevant contents from the Unihan.txt file */
51 if((file = fopen("Unihan.txt", "r")) == NULL)
/* Read one line at a time; only lines that begin with "U+" are converted. */
56 while(fgets(buffer, sizeof(buffer), file))
58 if(buffer[0] == 'U' && buffer[1] == '+')
/* Hand over the text that follows the "U+" prefix. */
60 convertLine(buffer + 2);
65 /* Load the relevant contents from the shiftjis.txt file,
66 to get mappings for non-CJK characters */
67 if((file = fopen("shiftjis.txt", "r")) == NULL)
/* Report why the file could not be opened. */
69 perror("shiftjis.txt");
/* Only lines that begin with "0x" are converted. */
72 while(fgets(buffer, sizeof(buffer), file))
74 if(buffer[0] == '0' && buffer[1] == 'x')
/* Hand over the text that follows the "0x" prefix. */
76 convertSJISLine(buffer + 2);
81 /* Create the output tables */
/* The result of createTables is kept in "error"; how it is used afterwards
   is not visible here. */
82 error = createTables();
84 /* Clean up and exit */
/*
 * Parse a hexadecimal value from the start of "buf".
 *
 * Digits '0'-'9', 'A'-'F' and 'a'-'f' are accepted; parsing stops at the
 * first character that is not a hexadecimal digit, or at the end of the
 * string.  No "0x" or "U+" prefix is skipped here; callers pass a pointer
 * that is already positioned on the first digit.
 *
 * buf:   NUL-terminated text to parse.
 * value: receives the parsed number; set to 0 when no digit is present.
 *
 * Returns the length of the value that was parsed, i.e. the number of
 * characters consumed (0 if "buf" does not start with a hex digit).
 * Overflow of "unsigned long" is not detected.
 */
static int parseHex(const char *buf, unsigned long *value)
{
	int len = 0;
	char ch;

	/* Start from zero so the result never depends on the caller's storage */
	*value = 0;
	while((ch = buf[len]) != '\0')
	{
		if(ch >= '0' && ch <= '9')
		{
			*value = *value * 16 + (unsigned long)(ch - '0');
		}
		else if(ch >= 'A' && ch <= 'F')
		{
			*value = *value * 16 + (unsigned long)(ch - 'A' + 10);
		}
		else if(ch >= 'a' && ch <= 'f')
		{
			*value = *value * 16 + (unsigned long)(ch - 'a' + 10);
		}
		else
		{
			/* First non-hex character ends the value */
			break;
		}
		++len;
	}
	return len;
}
121 * Parse "ku" and "ten" values from a buffer.
/*
 * Reads a run of decimal digits from the start of "buf" and reports a "ku"
 * and a "ten" value through the two output pointers.
 *
 * NOTE(review): only the digit-accumulation loop is visible here.  The
 * declaration of "value" and the statements that derive *ku and *ten from
 * it are not shown - confirm against the complete file.
 */
123 static void parseKuTen(const char *buf, int *ku, int *ten)
/* Consume consecutive ASCII digits, accumulating them as a base-10 number. */
126 while(*buf >= '0' && *buf <= '9')
128 value = value * 10 + (*buf++ - '0');
/* File-scope lookup tables filled while the input files are parsed and
   written out by writeJis. */
/* Unicode value for each JIS X 0208 cell, indexed by (ku - 1) * 94 + (ten - 1). */
137 static unsigned short jisx0208ToUnicode[94*94];
/* Unicode value for each JIS X 0212 cell, indexed the same way. */
138 static unsigned short jisx0212ToUnicode[94*94];
/* Indexed by Unicode code point.  Entries hold the cell index plus 0x0100
   (written by processJis0208) or plus 0x8000 (written by processJis0212). */
139 static unsigned short unicodeToJis[65536];
/* Indexed by (code - 0x0391) for code points 0x0391..0x0451. */
140 static unsigned short greekToJis[0x451 - 0x0391 + 1];
/* Indexed by (code - 0xFF01) for code points 0xFF01..0xFFEF. */
141 static unsigned short extraToJis[0xFFEF - 0xFF01 + 1];
/* Bounds of the unicodeToJis slice that writeJis emits.  They start out
   inverted (low above high).
   NOTE(review): the code that updates them is not visible here. */
142 static unsigned long lowJis = 0xFFFF;
143 static unsigned long highJis = 0x0000;
146 * Process a JIS X 0208 sequence by ku and ten values.
/*
 * Records the mapping between Unicode value "code" and the JIS X 0208 cell
 * given by "ku" and "ten", in both directions.
 *
 * NOTE(review): no range check on ku, ten or code is visible here.  A ku or
 * ten outside 1..94, or a code above 0xFFFF, would index outside the
 * tables - confirm that callers guarantee the ranges.  The rest of this
 * function is not visible here.
 */
148 static void processJis0208(unsigned long code, int ku, int ten)
/* ku and ten are treated as 1-based; flatten them into a 0-based index
   into the 94 x 94 table. */
150 int offset = (ku - 1) * 94 + (ten - 1);
151 jisx0208ToUnicode[offset] = (unsigned short)code;
/* Reverse direction: the stored value is the cell index plus 0x0100. */
152 unicodeToJis[code] = (unsigned short)(offset + 0x0100);
164 * Process a JIS X 0212 sequence by ku and ten values.
/*
 * Records the mapping between Unicode value "code" and the JIS X 0212 cell
 * given by "ku" and "ten", in both directions.
 *
 * NOTE(review): no range check on ku, ten or code is visible here.  A ku or
 * ten outside 1..94, or a code above 0xFFFF, would index outside the
 * tables - confirm that callers guarantee the ranges.  The rest of this
 * function is not visible here.
 */
166 static void processJis0212(unsigned long code, int ku, int ten)
/* ku and ten are treated as 1-based; flatten them into a 0-based index
   into the 94 x 94 table. */
168 int offset = (ku - 1) * 94 + (ten - 1);
169 jisx0212ToUnicode[offset] = (unsigned short)code;
/* Reverse direction: the stored value is the cell index plus 0x8000, which
   differs from the 0x0100 bias used for JIS X 0208 entries. */
170 unicodeToJis[code] = (unsigned short)(offset + 0x8000);
182 * Convert an input line into table entries.
/*
 * Handles one Unihan.txt line.  main passes the text that follows the "U+"
 * prefix, so "buf" starts at the hexadecimal code point.  The line is split
 * into code point, key name and value; the keys "kJis0" and "kJis1" update
 * the JIS X 0208 and JIS X 0212 tables respectively.
 *
 * NOTE(review): local declarations, loop bodies, the surrogate check and
 * the copy into "key" are not visible here - confirm against the complete
 * file.
 */
184 static void convertLine(char *buf)
190 /* Parse the hex name of the Unicode character */
/* parseHex returns the number of characters consumed, so this also moves
   "buf" past the code point. */
191 buf += parseHex(buf, &code);
194 /* Cannot handle surrogate-based CJK characters yet */
198 /* Skip to the key name */
/* The scan stops at the first 'k' or at the end of the string. */
199 while(*buf != '\0' && *buf != 'k')
208 /* Extract the key name from the buffer */
/* The key ends at the first space or tab. */
210 while(*buf != '\0' && *buf != ' ' && *buf != '\t')
220 /* Skip to the value field */
221 while(*buf != '\0' && (*buf == ' ' || *buf == '\t' ||
222 *buf == '\r' || *buf == '\n'))
231 /* Determine what to do based on the key */
/* "kJis0" values go to the JIS X 0208 tables. */
232 if(!strcmp(key, "kJis0"))
234 parseKuTen(buf, &ku, &ten);
235 processJis0208(code, ku, ten);
/* "kJis1" values go to the JIS X 0212 tables. */
237 else if(!strcmp(key, "kJis1"))
239 parseKuTen(buf, &ku, &ten);
240 processJis0212(code, ku, ten);
245 * Convert a line from the "shiftjis.txt" file.
/*
 * Handles one shiftjis.txt line.  main passes the text that follows the
 * leading "0x", so "buf" starts at the hexadecimal Shift-JIS code.  The
 * two-byte code is turned into a 0-based cell index ("offset") and the
 * Unicode value found on the line is stored in the table that matches its
 * range.
 *
 * NOTE(review): local declarations and the bodies of the error branches
 * (missing "0x", invalid first or second byte) are not visible here -
 * confirm against the complete file.
 */
247 static void convertSJISLine(char *buf)
254 /* Read the Shift-JIS code point */
255 buf += parseHex(buf, &sjis);
/* Skip the whitespace that separates the two columns. */
260 while(*buf != '\0' && (*buf == ' ' || *buf == '\t' ||
261 *buf == '\r' || *buf == '\n'))
/* The second column must start with "0x". */
265 if(*buf != '0' || buf[1] != 'x')
271 /* Read the Unicode code point */
272 buf += parseHex(buf, &code);
274 /* Convert the Shift-JIS code point into a JIS kuten value */
/* ch1 is the high byte and ch2 the low byte of the 16-bit code. */
275 ch1 = (int)(sjis >> 8);
276 ch2 = (int)(sjis & 0xFF);
/* Each first byte accounts for 0xBC (188 = 2 * 94) consecutive cells.
   First bytes 0x81..0x9F come first. */
277 if(ch1 >= 0x81 && ch1 <= 0x9F)
279 offset = (ch1 - 0x81) * 0xBC;
/* First bytes 0xE0..0xEF continue right after the 0x81..0x9F group. */
281 else if(ch1 >= 0xE0 && ch1 <= 0xEF)
283 offset = (ch1 - 0xE0 + (0xA0 - 0x81)) * 0xBC;
287 /* Invalid first byte */
/* Second bytes 0x40..0x7E map to positions 0..0x3E. */
290 if(ch2 >= 0x40 && ch2 <= 0x7E)
292 offset += (ch2 - 0x40);
/* Second bytes 0x80..0xFC continue at position 0x3F; 0x7F is not accepted. */
294 else if(ch2 >= 0x80 && ch2 <= 0xFC)
296 offset += (ch2 - 0x80 + 0x3F);
300 /* Invalid second byte */
304 /* Process the kuten value */
/* Code points 0x0391..0x0451 are stored in greekToJis rather than
   unicodeToJis. */
305 if(code >= 0x0391 && code <= 0x0451)
308 greekToJis[code - 0x0391] = (unsigned short)(offset + 0x0100);
309 /* This is required to decode Extra subset to Unicode!! */
310 jisx0208ToUnicode[offset] = (unsigned short)code;
/* Code points 0xFF01..0xFFEF are stored in extraToJis. */
312 else if(code >= 0xFF01 && code <= 0xFFEF)
315 extraToJis[code - 0xFF01] = (unsigned short)(offset + 0x0100);
316 /* This is required to decode Extra subset to Unicode!! */
317 jisx0208ToUnicode[offset] = (unsigned short)code;
/* Code points in 0x0100..0x4DFF go through processJis0208. */
319 else if(code >= 0x0100 && code < 0x4E00)
321 /* Non-CJK characters within JIS */
/* Turn the 0-based offset back into 1-based ku and ten values. */
322 processJis0208(code, (offset / 94) + 1, (offset % 94) + 1);
/*
 * Write a section header.
 *
 * The header is eight bytes: the section number followed by the section
 * size, each emitted as a 32-bit little-endian value (least significant
 * byte first), independent of the host byte order.
 *
 * file: stream to write to.
 * num:  section number; only the low 32 bits are written.
 * size: section payload size as supplied by the caller; only the low
 *       32 bits are written.
 *
 * Write errors are not reported by this function.
 */
static void writeSection(FILE *file, unsigned long num, unsigned long size)
{
	/* Section number, low byte first */
	putc((int)(num & 0xFF), file);
	putc((int)((num >> 8) & 0xFF), file);
	putc((int)((num >> 16) & 0xFF), file);
	putc((int)((num >> 24) & 0xFF), file);

	/* Section size, low byte first */
	putc((int)(size & 0xFF), file);
	putc((int)((size >> 8) & 0xFF), file);
	putc((int)((size >> 16) & 0xFF), file);
	putc((int)((size >> 24) & 0xFF), file);
}
/*
 * Write an array of 16-bit data values.
 *
 * Each element is emitted as two bytes, low byte first, so the output does
 * not depend on the host byte order.
 *
 * file: stream to write to.
 * data: first element to write.
 * size: number of elements (not bytes) to write; 0 writes nothing.
 *
 * Write errors are not reported by this function.
 */
static void writeData(FILE *file, unsigned short *data, unsigned long size)
{
	while(size > 0)
	{
		putc((int)(*data & 0xFF), file);
		putc((int)((*data >> 8) & 0xFF), file);
		++data;
		--size;
	}
}
/*
 * Section numbers for the JIS table.
 *
 * Each number is written in a section header ahead of the corresponding
 * table in the output file.
 */
#define	JISX0208_To_Unicode		1
#define	JISX0212_To_Unicode		2
#define	CJK_To_JIS				3
#define	Greek_To_JIS			4
#define	Extra_To_JIS			5
365 * Write the JIS table file.
367 static void writeJis(FILE *file)
371 /* Write the JIS X 0208 to Unicode conversion table */
372 writeSection(file, JISX0208_To_Unicode, 94 * 94 * 2);
373 writeData(file, jisx0208ToUnicode, 94 * 94);
375 /* Write the JIS X 0212 to Unicode conversion table */
376 writeSection(file, JISX0212_To_Unicode, 94 * 94 * 2);
377 writeData(file, jisx0212ToUnicode, 94 * 94);
379 /* Write the Unicode to JIS conversion table */
380 size = highJis - lowJis + 1;
381 writeSection(file, CJK_To_JIS, size * 2);
382 writeData(file, unicodeToJis + lowJis, size);
383 printf("JIS: U+%04lX to U+%04lX\n", lowJis, highJis);
385 /* Write the Greek to JIS conversion table */
386 writeSection(file, Greek_To_JIS, sizeof(greekToJis));
387 writeData(file, greekToJis, sizeof(greekToJis) / 2);
389 /* Write the Extra to JIS conversion table */
390 writeSection(file, Extra_To_JIS, sizeof(extraToJis));
391 writeData(file, extraToJis, sizeof(extraToJis) / 2);
395 * Create all of the tables that we need based on the Unihan.txt file.
397 static int createTables(void)
401 /* Create the JIS conversion table */
402 if((file = fopen("jis.table", "wb")) == NULL)
404 if((file = fopen("jis.table", "wb")) == NULL)