2005-09-25 Atsushi Enomoto <atsushi@ximian.com>
[mono.git] / mcs / class / I18N / CJK / ISO2022JP.cs
1 using System;
2 using System.Text;
3
4 namespace I18N.CJK
5 {
6         // FIXME:
7         // find out what is the difference between 50220, 50221 and 50222.
8
9         public class CP50220 : ISO2022JPEncoding
10         {
11                 public CP50220 ()
12                         : base (true, true)
13                 {
14                 }
15
16                 public override int CodePage {
17                         get { return 50220; }
18                 }
19
20                 public override string EncodingName {
21                         get { return "Japanese (JIS)"; }
22                 }
23         }
24
25         public class CP50221 : ISO2022JPEncoding
26         {
27                 public CP50221 ()
28                         : base (false, true)
29                 {
30                 }
31
32                 public override int CodePage {
33                         get { return 50221; }
34                 }
35
36                 public override string EncodingName {
37                         get { return "Japanese (JIS-Allow 1 byte Kana)"; }
38                 }
39         }
40
41         public class CP50222 : ISO2022JPEncoding
42         {
43                 public CP50222 ()
44                         : base (true, true)
45                 {
46                 }
47
48                 public override int CodePage {
49                         get { return 50222; }
50                 }
51
52                 public override string EncodingName {
53                         get { return "Japanese (JIS-Allow 1 byte Kana - SO/SI)"; }
54                 }
55         }
56
57         public class ISO2022JPEncoding : Encoding
58         {
59                 static JISConvert convert = JISConvert.Convert;
60
61                 public ISO2022JPEncoding (bool allow1ByteKana, bool allowShiftIO)
62                 {
63                         this.allow_1byte_kana = allow1ByteKana;
64                         this.allow_shift_io = allowShiftIO;
65                 }
66
67                 readonly bool allow_1byte_kana, allow_shift_io;
68
69                 public override string BodyName {
70                         get { return "iso-2022-jp"; }
71                 }
72
73                 public override string HeaderName {
74                         get { return "iso-2022-jp"; }
75                 }
76
77                 public override string WebName {
78                         get { return "csISO2022JP"; }
79                 }
80
81                 public override int GetMaxByteCount (int charCount)
82                 {
83                         // ESC w ESC s ESC w ... (even number) ESC s
84                         return charCount / 2 * 5 + 4;
85                 }
86
87                 public override int GetMaxCharCount (int byteCount)
88                 {
89                         // no escape sequence
90                         return byteCount;
91                 }
92
93                 public override int GetByteCount (char [] chars, int charIndex, int charCount)
94                 {
95                         return new ISO2022JPEncoder (allow_1byte_kana, allow_shift_io).GetByteCount (chars, charIndex, charCount, true);
96                 }
97
98                 public override int GetBytes (char [] chars, int charIndex, int charCount, byte [] bytes, int byteIndex)
99                 {
100                         return new ISO2022JPEncoder (allow_1byte_kana, allow_shift_io).GetBytes (chars, charIndex, charCount, bytes, byteIndex, true);
101                 }
102
103                 public override int GetCharCount (byte [] bytes, int index, int count)
104                 {
105                         return new ISO2022JPDecoder (allow_1byte_kana, allow_shift_io).GetCharCount (bytes, index, count);
106                 }
107
108                 public override int GetChars (byte [] bytes, int byteIndex, int byteCount, char [] chars, int charIndex)
109                 {
110                         return new ISO2022JPDecoder (allow_1byte_kana, allow_shift_io).GetChars (bytes, byteIndex, byteCount, chars, charIndex);
111                 }
112         }
113
114         internal enum ISO2022JPMode {
115                 ASCII,
116                 JISX0208,
117                 JISX0201
118         }
119
120         internal class ISO2022JPEncoder : Encoder
121         {
122                 static JISConvert convert = JISConvert.Convert;
123
124                 readonly bool allow_1byte_kana, allow_shift_io;
125
126                 ISO2022JPMode m = ISO2022JPMode.ASCII;
127
128                 public ISO2022JPEncoder (bool allow1ByteKana, bool allowShiftIO)
129                 {
130                         this.allow_1byte_kana = allow1ByteKana;
131                         this.allow_shift_io = allowShiftIO;
132                 }
133
134                 public override int GetByteCount (char [] chars, int charIndex, int charCount, bool flush)
135                 {
136                         int end = charIndex + charCount;
137                         int value;
138                         int byteCount = 0;
139
140                         for (int i = charIndex; i < end; i++) {
141                                 char ch = chars [i];
142                                 if (ch >= 0x2010 && ch <= 0x9FA5)
143                                 {
144                                         if (m != ISO2022JPMode.JISX0208)
145                                                 byteCount += 3;
146                                         m = ISO2022JPMode.JISX0208;
147                                         // This range contains the bulk of the CJK set.
148                                         value = (ch - 0x2010) * 2;
149                                         value = ((int)(convert.cjkToJis[value])) |
150                                                         (((int)(convert.cjkToJis[value + 1])) << 8);
151                                 } else if (ch >= 0xFF01 && ch <= 0xFF60) {
152                                         if (m != ISO2022JPMode.JISX0208)
153                                                 byteCount += 3;
154                                         m = ISO2022JPMode.JISX0208;
155
156                                         // This range contains extra characters,
157                                         value = (ch - 0xFF01) * 2;
158                                         value = ((int)(convert.extraToJis[value])) |
159                                                         (((int)(convert.extraToJis[value + 1])) << 8);
160                                 } else if(ch >= 0xFF60 && ch <= 0xFFA0) {
161                                         if (m != ISO2022JPMode.JISX0201)
162                                                 byteCount += 3;
163                                         m = ISO2022JPMode.JISX0201;
164                                         value = ch - 0xFF60 + 0xA0;
165                                 } else if (ch < 128) {
166                                         if (m != ISO2022JPMode.ASCII)
167                                                 byteCount += 3;
168                                         m = ISO2022JPMode.ASCII;
169                                         value = (int) ch;
170                                 } else
171                                         // skip non-convertible character
172                                         continue;
173
174                                 if (value > 0x100)
175                                         byteCount += 2;
176                                 else
177                                         byteCount++;
178                         }
179                         // must end in ASCII mode
180                         if (flush && m != ISO2022JPMode.ASCII) {
181                                 byteCount += 3;
182                                 m = ISO2022JPMode.ASCII;
183                         }
184                         return byteCount;
185                 }
186
187                 // returns false if it failed to add required ESC.
188                 private bool SwitchMode (byte [] bytes, ref int byteIndex,
189                         ISO2022JPMode cur, ISO2022JPMode next)
190                 {
191                         if (cur == next)
192                                 return true;
193                         if (bytes.Length <= byteIndex + 3)
194                                 return false;
195                         bytes [byteIndex++] = 0x1B;
196                         bytes [byteIndex++] = (byte) (next == ISO2022JPMode.JISX0208 ? 0x24 : 0x28);
197                         bytes [byteIndex++] = (byte) (next == ISO2022JPMode.JISX0201 ? 0x49 : 0x42);
198                         return true;
199                 }
200
201                 public override int GetBytes (char [] chars, int charIndex, int charCount, byte [] bytes, int byteIndex, bool flush)
202                 {
203                         bool wide = false;
204                         int start = byteIndex;
205
206                         int end = charIndex + charCount;
207                         int value;
208
209                         for (int i = charIndex; i < end &&
210                                 byteIndex < bytes.Length + (wide ? 1 : 0); i++) {
211                                 char ch = chars [i];
212                                 if (ch >= 0x2010 && ch <= 0x9FA5)
213                                 {
214                                         if (!SwitchMode (bytes, ref byteIndex, m, ISO2022JPMode.JISX0208))
215                                                 break;
216                                         m = ISO2022JPMode.JISX0208;
217                                         // This range contains the bulk of the CJK set.
218                                         value = (ch - 0x2010) * 2;
219                                         value = ((int)(convert.cjkToJis[value])) |
220                                                         (((int)(convert.cjkToJis[value + 1])) << 8);
221                                 } else if (ch >= 0xFF01 && ch <= 0xFF60) {
222                                         if (!SwitchMode (bytes, ref byteIndex, m, ISO2022JPMode.JISX0208))
223                                                 break;
224                                         m = ISO2022JPMode.JISX0208;
225
226                                         // This range contains extra characters,
227                                         value = (ch - 0xFF01) * 2;
228                                         value = ((int)(convert.extraToJis[value])) |
229                                                         (((int)(convert.extraToJis[value + 1])) << 8);
230                                 } else if(ch >= 0xFF60 && ch <= 0xFFA0) {
231                                         if (!SwitchMode (bytes, ref byteIndex, m, ISO2022JPMode.JISX0201))
232                                                 break;
233                                         m = ISO2022JPMode.JISX0201;
234                                         value = ch - 0xFF60 + 0xA0;
235                                 } else if (ch < 128) {
236                                         if (!SwitchMode (bytes, ref byteIndex, m, ISO2022JPMode.ASCII))
237                                                 break;
238                                         m = ISO2022JPMode.ASCII;
239                                         value = (int) ch;
240                                 } else
241                                         // skip non-convertible character
242                                         continue;
243
244 //Console.WriteLine ("{0:X04} : {1:x02} {2:x02}", v, (int) v / 94 + 33, v % 94 + 33);
245                                 if (value > 0x100) {
246                                         value -= 0x0100;
247                                         bytes [byteIndex++] = (byte) (value / 94 + 33);
248                                         bytes [byteIndex++] = (byte) (value % 94 + 33);
249                                 }
250                                 else
251                                         bytes [byteIndex++] = (byte) value;
252                         }
253                         if (flush) {
254                                 // must end in ASCII mode
255                                 SwitchMode (bytes, ref byteIndex, m, ISO2022JPMode.ASCII);
256                                 m = ISO2022JPMode.ASCII;
257                         }
258                         return byteIndex - start;
259                 }
260         }
261
262         internal class ISO2022JPDecoder : Decoder
263         {
264                 static JISConvert convert = JISConvert.Convert;
265
266                 readonly bool allow_1byte_kana, allow_shift_io;
267
268                 public ISO2022JPDecoder (bool allow1ByteKana, bool allowShiftIO)
269                 {
270                         this.allow_1byte_kana = allow1ByteKana;
271                         this.allow_shift_io = allowShiftIO;
272                 }
273
274                 // GetCharCount
275                 public override int GetCharCount (byte [] bytes, int index, int count)
276                 {
277                         int ret = 0;
278
279                         int end = index + count;
280                         for (int i = index; i < end; i++) {
281                                 if (bytes [i] != 0x1B) {
282                                         ret++;
283                                         continue;
284                                 } else {
285                                         if (i + 2 >= end)
286                                                 break; // incomplete escape sequence
287                                         i++;
288                                         if (bytes [i] != 0x24 &&
289                                                 bytes [i] != 0x28)
290                                                 throw new ArgumentException ("Unexpected ISO-2022-JP escape sequence.");
291                                         i++;
292                                         if (bytes [i] != 0x42)
293                                                 throw new ArgumentException ("Unexpected ISO-2022-JP escape sequence.");
294                                 }
295                         }
296
297                         return ret;
298                 }
299
300                 private char ToChar (int value)
301                 {
302                         value <<= 1;
303                         return value >= convert.jisx0208ToUnicode.Length ? '?' :
304                                 (char) (((int) (convert.jisx0208ToUnicode [value])) |
305                                         (((int) (convert.jisx0208ToUnicode [value + 1])) << 8));
306                 }
307
308                 public override int GetChars (byte [] bytes, int byteIndex, int byteCount, char [] chars, int charIndex)
309                 {
310                         ISO2022JPMode m = ISO2022JPMode.ASCII;
311                         int start = charIndex;
312                         int end = byteIndex + byteCount;
313                         for (int i = byteIndex; i < end && charIndex < chars.Length; i++) {
314                                 if (bytes [i] != 0x1B) {
315                                         if (m == ISO2022JPMode.JISX0208) {
316                                                 if (i + 1 == end)
317                                                         break; // incomplete head of wide char
318
319                                                 // am so lazy, so reusing jis2sjis and 
320                                                 int s1 = ((bytes [i] - 1) >> 1) + ((bytes [i] <= 0x5e) ? 0x71 : 0xb1);
321                                                 int s2 = bytes [i + 1] + (((bytes [i] & 1) != 0) ? 0x20 : 0x7e);
322                                                 int v = (s1 - 0x81) * 0xBC;
323                                                 v += s2 - 0x41;
324
325                                                 chars [charIndex++] = ToChar (v);
326                                                 i++;
327                                         }
328                                         else if (m == ISO2022JPMode.JISX0201)
329                                                 chars [charIndex++] = (char) (bytes [i] + 0xFF40);
330                                         // LAMESPEC: actually this should not
331                                         // be allowed when 1byte-kana is not
332                                         // allowed, but MS.NET seems to allow
333                                         // it in any mode.
334                                         else if (bytes [i] > 0xA0 && bytes [i] < 0xE0) // half-width Katakana
335                                                 chars [charIndex++] = (char) (bytes [i] - 0xA0 + 0xFF60);
336                                         else
337                                                 chars [charIndex++] = (char) bytes [i];
338                                         continue;
339                                 } else {
340                                         if (i + 2 >= end)
341                                                 break; // incomplete escape sequence
342                                         i++;
343                                         bool wide = false;
344                                         if (bytes [i] == 0x24)
345                                                 wide = true;
346                                         else if (bytes [i] == 0x28)
347                                                 wide = false;
348                                         else
349                                                 throw new ArgumentException ("Unexpected ISO-2022-JP escape sequence.");
350                                         i++;
351                                         if (bytes [i] == 0x42)
352                                                 m = wide ? ISO2022JPMode.JISX0208 : ISO2022JPMode.ASCII;
353                                         else if (bytes [i] == 0x49)
354                                                 m = ISO2022JPMode.JISX0201;
355                                         else
356                                                 throw new ArgumentException (String.Format ("Unexpected ISO-2022-JP escape sequence. Ended with 0x{0:X04}", bytes [i]));
357                                 }
358                         }
359
360                         return charIndex - start;
361                 }
362         }
363
364         public class ENCiso_2022_jp : CP50220
365         {
366                 public ENCiso_2022_jp () : base() {}
367
368         }; // class ENCiso_2022_jp
369 }