merge -r 60439:60440
[mono.git] / mcs / class / I18N / CJK / ISO2022JP.cs
1 //
2 // ISO2022JP.cs
3 //
4 // Author:
5 //      Atsushi Enomoto  <atsushi@ximian.com>
6 //
7 using System;
8 using System.Text;
9 using I18N.Common;
10
11 namespace I18N.CJK
12 {
13         [Serializable]
14         public class CP50220 : ISO2022JPEncoding
15         {
16                 public CP50220 ()
17                         : base (50220, false, false)
18                 {
19                 }
20
21                 public override string EncodingName {
22                         get { return "Japanese (JIS)"; }
23                 }
24         }
25
26         [Serializable]
27         public class CP50221 : ISO2022JPEncoding
28         {
29                 public CP50221 ()
30                         : base (50221, true, false)
31                 {
32                 }
33
34                 public override string EncodingName {
35                         get { return "Japanese (JIS-Allow 1 byte Kana)"; }
36                 }
37         }
38
39         [Serializable]
40         public class CP50222 : ISO2022JPEncoding
41         {
42                 public CP50222 ()
43                         : base (50222, true, true)
44                 {
45                 }
46
47                 public override string EncodingName {
48                         get { return "Japanese (JIS-Allow 1 byte Kana - SO/SI)"; }
49                 }
50         }
51
52         [Serializable]
53         public class ISO2022JPEncoding : MonoEncoding
54         {
55                 public ISO2022JPEncoding (int codePage, bool allow1ByteKana, bool allowShiftIO)
56                         : base (codePage, 932)
57                 {
58                         this.allow_1byte_kana = allow1ByteKana;
59                         this.allow_shift_io = allowShiftIO;
60                 }
61
62                 readonly bool allow_1byte_kana, allow_shift_io;
63
64                 public override string BodyName {
65                         get { return "iso-2022-jp"; }
66                 }
67
68                 public override string HeaderName {
69                         get { return "iso-2022-jp"; }
70                 }
71
72                 public override string WebName {
73                         get { return "csISO2022JP"; }
74                 }
75
76                 public override int GetMaxByteCount (int charCount)
77                 {
78                         // ESC w ESC s ESC w ... (even number) ESC s
79                         return charCount / 2 * 5 + 4;
80                 }
81
82                 public override int GetMaxCharCount (int byteCount)
83                 {
84                         // no escape sequence
85                         return byteCount;
86                 }
87
88                 public override int GetByteCount (char [] chars, int charIndex, int charCount)
89                 {
90                         return new ISO2022JPEncoder (this, allow_1byte_kana, allow_shift_io).GetByteCount (chars, charIndex, charCount, true);
91                 }
92
93                 public unsafe override int GetByteCountImpl (char* chars, int count)
94                 {
95                         return new ISO2022JPEncoder (this, allow_1byte_kana, allow_shift_io).GetByteCountImpl (chars, count, true);
96                 }
97
98                 public unsafe override int GetBytesImpl (char* chars, int charCount, byte* bytes, int byteCount)
99                 {
100                         return new ISO2022JPEncoder (this, allow_1byte_kana, allow_shift_io).GetBytesImpl (chars, charCount, bytes, byteCount, true);
101                 }
102
103                 public override int GetCharCount (byte [] bytes, int index, int count)
104                 {
105                         return new ISO2022JPDecoder (allow_1byte_kana, allow_shift_io).GetCharCount (bytes, index, count);
106                 }
107
108                 public override int GetChars (byte [] bytes, int byteIndex, int byteCount, char [] chars, int charIndex)
109                 {
110                         return new ISO2022JPDecoder (allow_1byte_kana, allow_shift_io).GetChars (bytes, byteIndex, byteCount, chars, charIndex);
111                 }
112         }
113
114         internal enum ISO2022JPMode {
115                 ASCII,
116                 JISX0208,
117                 JISX0201
118         }
119
120         internal class ISO2022JPEncoder : MonoEncoder
121         {
122                 static JISConvert convert = JISConvert.Convert;
123
124                 readonly bool allow_1byte_kana, allow_shift_io;
125
126                 ISO2022JPMode m = ISO2022JPMode.ASCII;
127                 bool shifted_in_count, shifted_in_conv;
128
129                 public ISO2022JPEncoder (MonoEncoding owner, bool allow1ByteKana, bool allowShiftIO)
130                         : base (owner)
131                 {
132                         this.allow_1byte_kana = allow1ByteKana;
133                         this.allow_shift_io = allowShiftIO;
134                 }
135
136                 public unsafe override int GetByteCountImpl (char* chars, int charCount, bool flush)
137                 {
138                         int charIndex = 0;
139                         int end = charCount;
140                         int value;
141                         int byteCount = 0;
142
143                         for (int i = charIndex; i < end; i++) {
144                                 char ch = chars [i];
145                                 // When half-kana is not allowed and it is
146                                 // actually in the input, convert to full width
147                                 // kana.
148                                 if (!allow_1byte_kana &&
149                                         ch >= 0xFF60 && ch <= 0xFFA0)
150                                         ch = full_width_map [ch - 0xFF60];
151
152                                 if (ch >= 0x2010 && ch <= 0x9FA5)
153                                 {
154                                         if (shifted_in_count) {
155                                                 shifted_in_count = false;
156                                                 byteCount++; // shift_out
157                                         }
158                                         if (m != ISO2022JPMode.JISX0208)
159                                                 byteCount += 3;
160                                         m = ISO2022JPMode.JISX0208;
161                                         // This range contains the bulk of the CJK set.
162                                         value = (ch - 0x2010) * 2;
163                                         value = ((int)(convert.cjkToJis[value])) |
164                                                         (((int)(convert.cjkToJis[value + 1])) << 8);
165                                 } else if (ch >= 0xFF01 && ch <= 0xFF60) {
166                                         if (shifted_in_count) {
167                                                 shifted_in_count = false;
168                                                 byteCount++;
169                                         }
170                                         if (m != ISO2022JPMode.JISX0208)
171                                                 byteCount += 3;
172                                         m = ISO2022JPMode.JISX0208;
173
174                                         // This range contains extra characters,
175                                         value = (ch - 0xFF01) * 2;
176                                         value = ((int)(convert.extraToJis[value])) |
177                                                         (((int)(convert.extraToJis[value + 1])) << 8);
178                                 } else if(ch >= 0xFF60 && ch <= 0xFFA0) {
179                                         if (allow_shift_io) {
180                                                 if (!shifted_in_count) {
181                                                         byteCount++;
182                                                         shifted_in_count = true;
183                                                 }
184                                         }
185                                         else if (m != ISO2022JPMode.JISX0201) {
186                                                 byteCount += 3;
187                                                 m = ISO2022JPMode.JISX0201;
188                                         }
189                                         value = ch - 0xFF60 + 0xA0;
190                                 } else if (ch < 128) {
191                                         if (shifted_in_count) {
192                                                 shifted_in_count = false;
193                                                 byteCount++;
194                                         }
195                                         if (m != ISO2022JPMode.ASCII)
196                                                 byteCount += 3;
197                                         m = ISO2022JPMode.ASCII;
198                                         value = (int) ch;
199                                 } else
200                                         // skip non-convertible character
201                                         continue;
202
203                                 if (value > 0x100)
204                                         byteCount += 2;
205                                 else
206                                         byteCount++;
207                         }
208                         // must end in ASCII mode
209                         if (flush) {
210                                 if (shifted_in_count) {
211                                         shifted_in_count = false;
212                                         byteCount++;
213                                 }
214                                 if (m != ISO2022JPMode.ASCII)
215                                         byteCount += 3;
216                                 m = ISO2022JPMode.ASCII;
217                         }
218                         return byteCount;
219                 }
220
221                 // returns false if it failed to add required ESC.
222                 private unsafe void SwitchMode (byte* bytes, ref int byteIndex,
223                         ref int byteCount, ref ISO2022JPMode cur, ISO2022JPMode next)
224                 {
225                         if (cur == next)
226                                 return;
227
228                         if (byteCount <= 3)
229                                 throw new ArgumentOutOfRangeException ("Insufficient byte buffer.");
230                         bytes [byteIndex++] = 0x1B;
231                         bytes [byteIndex++] = (byte) (next == ISO2022JPMode.JISX0208 ? 0x24 : 0x28);
232                         bytes [byteIndex++] = (byte) (next == ISO2022JPMode.JISX0201 ? 0x49 : 0x42);
233                         cur = next;
234                 }
235
236                 static readonly char [] full_width_map = new char [] {
237                         '\0', '\u3002', '\u300C', '\u300D', '\u3001', '\u30FB', // to nakaguro
238                         '\u30F2', '\u30A1', '\u30A3', '\u30A5', '\u30A7', '\u30A9', '\u30E3', '\u30E5', '\u30E7', '\u30C3', // to small tsu
239                         '\u30FC', '\u30A2', '\u30A4', '\u30A6', '\u30A8', '\u30AA', // A-O
240                         '\u30AB', '\u30AD', '\u30AF', '\u30B1', '\u30B3',
241                         '\u30B5', '\u30B7', '\u30B9', '\u30BB', '\u30BD',
242                         '\u30BF', '\u30C1', '\u30C4', '\u30C6', '\u30C8',
243                         '\u30C9', '\u30CA', '\u30CB', '\u30CC', '\u30CD',
244                         '\u30CF', '\u30D2', '\u30D5', '\u30D8', '\u30DB',
245                         '\u30DE', '\u30DF', '\u30E0', '\u30E1', '\u30E2',
246                         '\u30E4', '\u30E6', '\u30E8', // Ya-Yo
247                         '\u30E9', '\u30EA', '\u30EB', '\u30EC', '\u30ED',
248                         '\u30EF', '\u30F1', '\u30F3', '\u309B', '\u309C'};
249
250                 public unsafe override int GetBytesImpl (
251                         char* chars, int charCount,
252                         byte* bytes, int byteCount, bool flush)
253                 {
254                         int charIndex = 0;
255                         int byteIndex = 0;
256
257                         int start = byteIndex;
258                         int end = charIndex + charCount;
259                         int value;
260
261                         for (int i = charIndex; i < end; i++, charCount--) {
262                                 char ch = chars [i];
263
264                                 // When half-kana is not allowed and it is
265                                 // actually in the input, convert to full width
266                                 // kana.
267                                 if (!allow_1byte_kana &&
268                                         ch >= 0xFF60 && ch <= 0xFFA0)
269                                         ch = full_width_map [ch - 0xFF60];
270
271                                 if (ch >= 0x2010 && ch <= 0x9FA5)
272                                 {
273                                         if (shifted_in_conv) {
274                                                 bytes [byteIndex++] = 0x0F;
275                                                 shifted_in_conv = false;
276                                                 byteCount--;
277                                         }
278                                         switch (m) {
279                                         case ISO2022JPMode.JISX0208:
280                                                 break;
281                                         default:
282                                                 SwitchMode (bytes, ref byteIndex, ref byteCount, ref m, ISO2022JPMode.JISX0208);
283                                                 break;
284                                         }
285                                         // This range contains the bulk of the CJK set.
286                                         value = (ch - 0x2010) * 2;
287                                         value = ((int)(convert.cjkToJis[value])) |
288                                                         (((int)(convert.cjkToJis[value + 1])) << 8);
289                                 } else if (ch >= 0xFF01 && ch <= 0xFF60) {
290                                         if (shifted_in_conv) {
291                                                 bytes [byteIndex++] = 0x0F;
292                                                 shifted_in_conv = false;
293                                                 byteCount--;
294                                         }
295                                         switch (m) {
296                                         case ISO2022JPMode.JISX0208:
297                                                 break;
298                                         default:
299                                                 SwitchMode (bytes, ref byteIndex, ref byteCount, ref m, ISO2022JPMode.JISX0208);
300                                                 break;
301                                         }
302
303                                         // This range contains extra characters,
304                                         value = (ch - 0xFF01) * 2;
305                                         value = ((int)(convert.extraToJis[value])) |
306                                                         (((int)(convert.extraToJis[value + 1])) << 8);
307                                 } else if (ch >= 0xFF60 && ch <= 0xFFA0) {
308                                         // disallowed half-width kana is
309                                         // already converted to full-width kana
310                                         // so here we don't have to consider it.
311
312                                         if (allow_shift_io) {
313                                                 if (!shifted_in_conv) {
314                                                         bytes [byteIndex++] = 0x0E;
315                                                         shifted_in_conv = true;
316                                                         byteCount--;
317                                                 }
318                                         } else {
319                                                 switch (m) {
320                                                 case ISO2022JPMode.JISX0201:
321                                                         break;
322                                                 default:
323                                                         SwitchMode (bytes, ref byteIndex, ref byteCount, ref m, ISO2022JPMode.JISX0201);
324                                                         break;
325                                                 }
326                                         }
327                                         value = ch - 0xFF40;
328                                 } else if (ch < 128) {
329                                         if (shifted_in_conv) {
330                                                 bytes [byteIndex++] = 0x0F;
331                                                 shifted_in_conv = false;
332                                                 byteCount--;
333                                         }
334                                         SwitchMode (bytes, ref byteIndex, ref byteCount, ref m, ISO2022JPMode.ASCII);
335                                         value = (int) ch;
336                                 } else {
337 #if NET_2_0
338                                         HandleFallback (
339                                                 chars, ref i, ref charCount,
340                                                 bytes, ref byteIndex, ref byteCount);
341 #endif
342                                         // skip non-convertible character
343                                         continue;
344                                 }
345
346 //Console.WriteLine ("{0:X04} : {1:x02} {2:x02}", v, (int) v / 94 + 33, v % 94 + 33);
347                                 if (value > 0x100) {
348                                         value -= 0x0100;
349                                         bytes [byteIndex++] = (byte) (value / 94 + 33);
350                                         bytes [byteIndex++] = (byte) (value % 94 + 33);
351                                         byteCount -= 2;
352                                 }
353                                 else {
354                                         bytes [byteIndex++] = (byte) value;
355                                         byteCount--;
356                                 }
357                         }
358                         if (flush) {
359                                 // must end in ASCII mode
360                                 if (shifted_in_conv) {
361                                         bytes [byteIndex++] = 0x0F;
362                                         shifted_in_conv = false;
363                                         byteCount--;
364                                 }
365                                 if (m != ISO2022JPMode.ASCII)
366                                         SwitchMode (bytes, ref byteIndex, ref byteCount, ref m, ISO2022JPMode.ASCII);
367                         }
368                         return byteIndex - start;
369                 }
370
371 #if NET_2_0
372                 public override void Reset ()
373                 {
374                         m = ISO2022JPMode.ASCII;
375                         shifted_in_conv = shifted_in_count = false;
376                 }
377 #endif
378         }
379
380         internal class ISO2022JPDecoder : Decoder
381         {
382                 static JISConvert convert = JISConvert.Convert;
383
384                 readonly bool allow_shift_io;
385                 ISO2022JPMode m = ISO2022JPMode.ASCII;
386                 bool shifted_in_conv, shifted_in_count;
387
388                 public ISO2022JPDecoder (bool allow1ByteKana, bool allowShiftIO)
389                 {
390                         this.allow_shift_io = allowShiftIO;
391                 }
392
393                 // GetCharCount
394                 public override int GetCharCount (byte [] bytes, int index, int count)
395                 {
396                         int ret = 0;
397
398                         int end = index + count;
399                         for (int i = index; i < end; i++) {
400                                 if (allow_shift_io) {
401                                         switch (bytes [i]) {
402                                         case 0x0F:
403                                                 shifted_in_count = false;
404                                                 continue;
405                                         case 0x0E:
406                                                 shifted_in_count = true;
407                                                 continue;
408                                         }
409                                 }
410                                 if (bytes [i] != 0x1B) {
411                                         if (!shifted_in_count && m == ISO2022JPMode.JISX0208) {
412                                                 if (i + 1 == end)
413                                                         break; // incomplete head of wide char
414                                                 else
415                                                         ret++;
416                                                 i++; // 2 byte char
417                                         }
418                                         else
419                                                 ret++; // half-kana or ASCII
420                                 } else {
421                                         if (i + 2 >= end)
422                                                 break; // incomplete escape sequence
423                                         i++;
424                                         bool wide = false;
425                                         if (bytes [i] == 0x24)
426                                                 wide = true;
427                                         else if (bytes [i] == 0x28)
428                                                 wide = false;
429                                         else
430                                                 throw new ArgumentException ("Unexpected ISO-2022-JP escape sequence.");
431                                         i++;
432                                         if (bytes [i] == 0x42)
433                                                 m = wide ? ISO2022JPMode.JISX0208 : ISO2022JPMode.ASCII;
434                                         else if (bytes [i] == 0x49)
435                                                 m = ISO2022JPMode.JISX0201;
436                                         else
437                                                 throw new ArgumentException (String.Format ("Unexpected ISO-2022-JP escape sequence. Ended with 0x{0:X04}", bytes [i]));
438                                 }
439                         }
440                         return ret;
441                 }
442
443                 private int ToChar (int value)
444                 {
445                         value <<= 1;
446                         return value + 1 >= convert.jisx0208ToUnicode.Length ?
447                                 -1 :
448                                 ((int) (convert.jisx0208ToUnicode [value])) |
449                                         (((int) (convert.jisx0208ToUnicode [value + 1])) << 8);
450                 }
451
452                 public override int GetChars (byte [] bytes, int byteIndex, int byteCount, char [] chars, int charIndex)
453                 {
454                         int start = charIndex;
455                         int end = byteIndex + byteCount;
456                         for (int i = byteIndex; i < end && charIndex < chars.Length; i++) {
457                                 if (allow_shift_io) {
458                                         switch (bytes [i]) {
459                                         case 0x0F:
460                                                 shifted_in_conv = false;
461                                                 continue;
462                                         case 0x0E:
463                                                 shifted_in_conv = true;
464                                                 continue;
465                                         }
466                                 }
467
468                                 if (bytes [i] != 0x1B) {
469                                         if (shifted_in_conv || m == ISO2022JPMode.JISX0201) {
470                                                 // half-kana
471                                                 if (bytes [i] < 0x60)
472                                                         chars [charIndex++] = (char) (bytes [i] + 0xFF40);
473                                                 else
474                                                         // invalid
475                                                         chars [charIndex++] = '?';
476                                         }
477                                         else if (m == ISO2022JPMode.JISX0208) {
478                                                 if (i + 1 == end)
479                                                         break; // incomplete head of wide char
480
481                                                 // am so lazy, so reusing jis2sjis
482                                                 int s1 = ((bytes [i] - 1) >> 1) + ((bytes [i] <= 0x5e) ? 0x71 : 0xb1);
483                                                 int s2 = bytes [i + 1] + (((bytes [i] & 1) != 0) ? 0x20 : 0x7e);
484                                                 int v = (s1 - 0x81) * 0xBC;
485                                                 v += s2 - 0x41;
486
487                                                 int ch = ToChar (v);
488                                                 if (ch < 0)
489                                                         chars [charIndex++] = '?';
490                                                 else
491                                                         chars [charIndex++] = (char) ch;
492                                                 i++;
493                                         }
494                                         // LAMESPEC: actually this should not
495                                         // be allowed when 1byte-kana is not
496                                         // allowed, but MS.NET seems to allow
497                                         // it in any mode.
498                                         else if (bytes [i] > 0xA0 && bytes [i] < 0xE0) // half-width Katakana
499                                                 chars [charIndex++] = (char) (bytes [i] - 0xA0 + 0xFF60);
500                                         else
501                                                 chars [charIndex++] = (char) bytes [i];
502                                         continue;
503                                 } else {
504                                         if (i + 2 >= end)
505                                                 break; // incomplete escape sequence
506                                         i++;
507                                         bool wide = false;
508                                         if (bytes [i] == 0x24)
509                                                 wide = true;
510                                         else if (bytes [i] == 0x28)
511                                                 wide = false;
512                                         else
513                                                 throw new ArgumentException ("Unexpected ISO-2022-JP escape sequence.");
514                                         i++;
515                                         if (bytes [i] == 0x42)
516                                                 m = wide ? ISO2022JPMode.JISX0208 : ISO2022JPMode.ASCII;
517                                         else if (bytes [i] == 0x49)
518                                                 m = ISO2022JPMode.JISX0201;
519                                         else
520                                                 throw new ArgumentException (String.Format ("Unexpected ISO-2022-JP escape sequence. Ended with 0x{0:X04}", bytes [i]));
521                                 }
522                         }
523
524                         return charIndex - start;
525                 }
526
527 #if NET_2_0
528                 public override void Reset ()
529                 {
530                         m = ISO2022JPMode.ASCII;
531                         shifted_in_count = shifted_in_conv = false;
532                 }
533 #endif
534         }
535
536         [Serializable]
537         public class ENCiso_2022_jp : CP50220
538         {
539                 public ENCiso_2022_jp () : base() {}
540
541         }; // class ENCiso_2022_jp
542 }