5 // Atsushi Enomoto <atsushi@ximian.com>
// ISO-2022-JP variant constructed with code page number 50220.
// Passes false for both allow1ByteKana and allowShiftIO, so the encoder folds
// half-width kana to full-width instead of writing them as single bytes.
14 public class CP50220 : ISO2022JPEncoding
17 : base (50220, false, false)
// Human-readable name of this variant.
21 public override string EncodingName {
22 get { return "Japanese (JIS)"; }
// ISO-2022-JP variant constructed with code page number 50221.
// Passes allow1ByteKana = true and allowShiftIO = false: half-width kana are kept
// as single bytes and the encoder selects ISO2022JPMode.JISX0201 for them.
27 public class CP50221 : ISO2022JPEncoding
30 : base (50221, true, false)
// Human-readable name of this variant.
34 public override string EncodingName {
35 get { return "Japanese (JIS-Allow 1 byte Kana)"; }
// ISO-2022-JP variant constructed with code page number 50222.
// Passes true for both allow1ByteKana and allowShiftIO: half-width kana are kept
// as single bytes and the encoder brackets them with the 0x0E / 0x0F shift bytes.
40 public class CP50222 : ISO2022JPEncoding
43 : base (50222, true, true)
// Human-readable name of this variant.
47 public override string EncodingName {
48 get { return "Japanese (JIS-Allow 1 byte Kana - SO/SI)"; }
// Common base of CP50220, CP50221 and CP50222. The two flags given to the
// constructor are stored and forwarded to every ISO2022JPEncoder /
// ISO2022JPDecoder this class creates.
53 public class ISO2022JPEncoding : MonoEncoding
// codePage:       code page number reported for this encoding.
// allow1ByteKana: when false the encoder converts half-width kana to full-width.
// allowShiftIO:   when true half-width kana are written between shift bytes
//                 rather than after an escape sequence.
55 public ISO2022JPEncoding (int codePage, bool allow1ByteKana, bool allowShiftIO)
// NOTE(review): 932 is presumably the associated Windows (Shift_JIS) code page
// expected by MonoEncoding - confirm against the MonoEncoding constructor.
56 : base (codePage, 932)
58 this.allow_1byte_kana = allow1ByteKana;
59 this.allow_shift_io = allowShiftIO;
// Immutable after construction.
62 readonly bool allow_1byte_kana, allow_shift_io;
// Name reported for mail bodies; identical for all three code pages.
64 public override string BodyName {
65 get { return "iso-2022-jp"; }
// Name reported for mail headers; same value as BodyName.
68 public override string HeaderName {
69 get { return "iso-2022-jp"; }
// Web name; note that it differs from BodyName / HeaderName.
72 public override string WebName {
73 get { return "csISO2022JP"; }
// Returns an upper bound on the number of bytes the encoder can produce for
// charCount characters.
// Per character the encoder may write one shift byte (0x0F), one three-byte
// escape sequence and a two-byte JIS X 0208 code: 6 bytes. After the last
// character it may add one shift byte plus ESC ( B to end in ASCII: 4 bytes.
// The former bound, charCount / 2 * 5 + 4, returned 4 for a single kanji, which
// really needs ESC $ B + 2 bytes + ESC ( B = 8 bytes.
76 public override int GetMaxByteCount (int charCount)
78 // checked: an overflowing product raises OverflowException instead of wrapping to a bogus size
79 return checked (charCount * 6 + 4);
82 public override int GetMaxCharCount (int byteCount)
// Counts the bytes needed for chars[charIndex .. charIndex + charCount).
// A fresh ISO2022JPEncoder is created per call and asked to flush (last argument
// true), so no encoder state is carried between calls.
88 public override int GetByteCount (char [] chars, int charIndex, int charCount)
90 return new ISO2022JPEncoder (this, allow_1byte_kana, allow_shift_io).GetByteCount (chars, charIndex, charCount, true);
// Pointer-based byte count: delegates to a throw-away ISO2022JPEncoder with
// flush = true.
93 public unsafe override int GetByteCountImpl (char* chars, int count)
95 return new ISO2022JPEncoder (this, allow_1byte_kana, allow_shift_io).GetByteCountImpl (chars, count, true);
// Pointer-based conversion: delegates to a throw-away ISO2022JPEncoder with
// flush = true and returns whatever that encoder returns.
98 public unsafe override int GetBytesImpl (char* chars, int charCount, byte* bytes, int byteCount)
100 return new ISO2022JPEncoder (this, allow_1byte_kana, allow_shift_io).GetBytesImpl (chars, charCount, bytes, byteCount, true);
// Counts the characters bytes[index .. index + count) decodes to, using a new
// ISO2022JPDecoder for each call.
103 public override int GetCharCount (byte [] bytes, int index, int count)
105 return new ISO2022JPDecoder (allow_1byte_kana, allow_shift_io).GetCharCount (bytes, index, count);
// Decodes bytes[byteIndex .. byteIndex + byteCount) into chars starting at
// charIndex, using a new ISO2022JPDecoder for each call; returns that decoder's
// result.
108 public override int GetChars (byte [] bytes, int byteIndex, int byteCount, char [] chars, int charIndex)
110 return new ISO2022JPDecoder (allow_1byte_kana, allow_shift_io).GetChars (bytes, byteIndex, byteCount, chars, charIndex);
// Character-set state tracked by ISO2022JPEncoder and ISO2022JPDecoder.
// The members referenced by those classes are ASCII, JISX0208 and JISX0201.
114 internal enum ISO2022JPMode {
// Stateful encoder used by ISO2022JPEncoding for both counting and converting.
120 internal class ISO2022JPEncoder : MonoEncoder
// Lookup tables (cjkToJis, extraToJis) shared by all encoder instances.
122 static JISConvert convert = JISConvert.Convert;
124 readonly bool allow_1byte_kana, allow_shift_io;
// Current mode. This one field is assigned by GetByteCountImpl and passed by
// reference to SwitchMode from GetBytesImpl.
126 ISO2022JPMode m = ISO2022JPMode.ASCII;
// Shift flags are kept separately for counting (_count) and converting (_conv).
127 bool shifted_in_count, shifted_in_conv;
// NOTE(review): owner is presumably forwarded to the MonoEncoder base
// constructor - that line is not shown here; confirm.
129 public ISO2022JPEncoder (MonoEncoding owner, bool allow1ByteKana, bool allowShiftIO)
132 this.allow_1byte_kana = allow1ByteKana;
133 this.allow_shift_io = allowShiftIO;
// Computes how many bytes GetBytesImpl would write for the same input by walking
// the characters and tracking the same mode / shift transitions without writing.
// NOTE(review): this method assigns the field m, which GetBytesImpl also uses;
// confirm that interleaving both calls on one instance without a flush is safe.
136 public unsafe override int GetByteCountImpl (char* chars, int charCount, bool flush)
143 for (int i = charIndex; i < end; i++) {
145 // When half-kana is not allowed and it is
146 // actually in the input, convert to full width
148 if (!allow_1byte_kana &&
149 ch >= 0xFF60 && ch <= 0xFFA0)
150 ch = full_width_map [ch - 0xFF60];
// Branch 1: U+2010..U+9FA5, looked up in convert.cjkToJis.
152 if (ch >= 0x2010 && ch <= 0x9FA5)
154 if (shifted_in_count) {
155 shifted_in_count = false;
156 byteCount++; // shift_out (the single 0x0F byte GetBytesImpl writes at this point)
158 if (m != ISO2022JPMode.JISX0208)
160 m = ISO2022JPMode.JISX0208;
161 // This range contains the bulk of the CJK set.
// Two table bytes per character, combined low byte first.
162 value = (ch - 0x2010) * 2;
163 value = ((int)(convert.cjkToJis[value])) |
164 (((int)(convert.cjkToJis[value + 1])) << 8);
// Branch 2: U+FF01..U+FF60, looked up in convert.extraToJis. U+FF60 satisfies
// this test first, so it never reaches the half-width kana branch below.
165 } else if (ch >= 0xFF01 && ch <= 0xFF60) {
166 if (shifted_in_count) {
167 shifted_in_count = false;
170 if (m != ISO2022JPMode.JISX0208)
172 m = ISO2022JPMode.JISX0208;
174 // This range contains extra characters,
175 value = (ch - 0xFF01) * 2;
176 value = ((int)(convert.extraToJis[value])) |
177 (((int)(convert.extraToJis[value + 1])) << 8);
// Branch 3: half-width kana that were not folded above.
178 } else if(ch >= 0xFF60 && ch <= 0xFFA0) {
// With shift I/O the kana are entered through the shift state; otherwise
// through ISO2022JPMode.JISX0201.
179 if (allow_shift_io) {
180 if (!shifted_in_count) {
182 shifted_in_count = true;
185 else if (m != ISO2022JPMode.JISX0201) {
187 m = ISO2022JPMode.JISX0201;
// One-byte code: U+FF61 becomes 0xA1.
189 value = ch - 0xFF60 + 0xA0;
// Branch 4: 7-bit ASCII.
190 } else if (ch < 128) {
191 if (shifted_in_count) {
192 shifted_in_count = false;
195 if (m != ISO2022JPMode.ASCII)
197 m = ISO2022JPMode.ASCII;
200 // skip non-convertible character
208 // must end in ASCII mode
210 if (shifted_in_count) {
211 shifted_in_count = false;
214 if (m != ISO2022JPMode.ASCII)
216 m = ISO2022JPMode.ASCII;
221 // Writes the three-byte escape sequence that designates `next`:
// ESC $ B for JISX0208, ESC ( I for JISX0201, ESC ( B otherwise (ASCII).
// The method is void: it reports failure by throwing
// ArgumentOutOfRangeException, not by returning a status.
// NOTE(review): the condition guarding the throw is not shown here -
// presumably a remaining-space check on byteCount; confirm.
222 private unsafe void SwitchMode (byte* bytes, ref int byteIndex,
223 ref int byteCount, ref ISO2022JPMode cur, ISO2022JPMode next)
// Two-argument constructor: the one-string overload takes a parameter name,
// so the message text used to be reported as the parameter name.
229 throw new ArgumentOutOfRangeException ("bytes", "Insufficient byte buffer.");
230 bytes [byteIndex++] = 0x1B;
231 bytes [byteIndex++] = (byte) (next == ISO2022JPMode.JISX0208 ? 0x24 : 0x28);
232 bytes [byteIndex++] = (byte) (next == ISO2022JPMode.JISX0201 ? 0x49 : 0x42);
// Full-width replacement for each code point U+FF60..U+FFA0, indexed by
// (ch - 0xFF60); used when 1-byte kana are not allowed. The table must keep
// 65 entries because callers index it with every value in that range.
// Fixed: the Na row used to start with U+30C9 (DO), shifting NA..NE by one and
// dropping U+30CE (NO); the last row used to contain U+30F1 (WE), shifting
// N and the two sound marks by one.
236 static readonly char [] full_width_map = new char [] {
237 '\0', '\u3002', '\u300C', '\u300D', '\u3001', '\u30FB', // to nakaguro
238 '\u30F2', '\u30A1', '\u30A3', '\u30A5', '\u30A7', '\u30A9', '\u30E3', '\u30E5', '\u30E7', '\u30C3', // to small tsu
239 '\u30FC', '\u30A2', '\u30A4', '\u30A6', '\u30A8', '\u30AA', // A-O
240 '\u30AB', '\u30AD', '\u30AF', '\u30B1', '\u30B3',
241 '\u30B5', '\u30B7', '\u30B9', '\u30BB', '\u30BD',
242 '\u30BF', '\u30C1', '\u30C4', '\u30C6', '\u30C8',
243 '\u30CA', '\u30CB', '\u30CC', '\u30CD', '\u30CE', // Na-No
244 '\u30CF', '\u30D2', '\u30D5', '\u30D8', '\u30DB',
245 '\u30DE', '\u30DF', '\u30E0', '\u30E1', '\u30E2',
246 '\u30E4', '\u30E6', '\u30E8', // Ya-Yo
247 '\u30E9', '\u30EA', '\u30EB', '\u30EC', '\u30ED',
248 '\u30EF', '\u30F3', '\u309B', '\u309C', '\0'}; // Wa, N, sound marks; U+FFA0 has no kana counterpart, same placeholder as U+FF60
// Converts charCount characters at chars into ISO-2022-JP bytes at bytes and
// returns the number of bytes written (byteIndex - start).
// Mode changes go through SwitchMode, which writes a three-byte escape
// sequence; the shift state is toggled with the single bytes 0x0E and 0x0F.
250 public unsafe override int GetBytesImpl (
251 char* chars, int charCount,
252 byte* bytes, int byteCount, bool flush)
257 int start = byteIndex;
258 int end = charIndex + charCount;
// charCount is decremented together with i, so it always holds the number of
// characters still to be processed.
261 for (int i = charIndex; i < end; i++, charCount--) {
264 // When half-kana is not allowed and it is
265 // actually in the input, convert to full width
267 if (!allow_1byte_kana &&
268 ch >= 0xFF60 && ch <= 0xFFA0)
269 ch = full_width_map [ch - 0xFF60];
// Branch 1: U+2010..U+9FA5, looked up in convert.cjkToJis.
271 if (ch >= 0x2010 && ch <= 0x9FA5)
// Leave the shift state before writing a two-byte character.
273 if (shifted_in_conv) {
274 bytes [byteIndex++] = 0x0F;
275 shifted_in_conv = false;
279 case ISO2022JPMode.JISX0208:
282 SwitchMode (bytes, ref byteIndex, ref byteCount, ref m, ISO2022JPMode.JISX0208);
285 // This range contains the bulk of the CJK set.
286 value = (ch - 0x2010) * 2;
287 value = ((int)(convert.cjkToJis[value])) |
288 (((int)(convert.cjkToJis[value + 1])) << 8);
// Branch 2: U+FF01..U+FF60, looked up in convert.extraToJis.
289 } else if (ch >= 0xFF01 && ch <= 0xFF60) {
290 if (shifted_in_conv) {
291 bytes [byteIndex++] = 0x0F;
292 shifted_in_conv = false;
296 case ISO2022JPMode.JISX0208:
299 SwitchMode (bytes, ref byteIndex, ref byteCount, ref m, ISO2022JPMode.JISX0208);
303 // This range contains extra characters,
304 value = (ch - 0xFF01) * 2;
305 value = ((int)(convert.extraToJis[value])) |
306 (((int)(convert.extraToJis[value + 1])) << 8);
// Branch 3: half-width kana kept as single bytes.
307 } else if (ch >= 0xFF60 && ch <= 0xFFA0) {
308 // disallowed half-width kana is
309 // already converted to full-width kana
310 // so here we don't have to consider it.
// With shift I/O: write 0x0E once on entering the shift state.
312 if (allow_shift_io) {
313 if (!shifted_in_conv) {
314 bytes [byteIndex++] = 0x0E;
315 shifted_in_conv = true;
// Without shift I/O: select ISO2022JPMode.JISX0201 through an escape sequence.
320 case ISO2022JPMode.JISX0201:
323 SwitchMode (bytes, ref byteIndex, ref byteCount, ref m, ISO2022JPMode.JISX0201);
// Branch 4: 7-bit ASCII; leave the shift state, then select ASCII.
328 } else if (ch < 128) {
329 if (shifted_in_conv) {
330 bytes [byteIndex++] = 0x0F;
331 shifted_in_conv = false;
334 SwitchMode (bytes, ref byteIndex, ref byteCount, ref m, ISO2022JPMode.ASCII);
// NOTE(review): these are the trailing arguments of a call whose first line is
// not shown - presumably the fallback handler for unconvertible characters; confirm.
339 chars, ref i, ref charCount,
340 bytes, ref byteIndex, ref byteCount);
342 // skip non-convertible character
346 //Console.WriteLine ("{0:X04} : {1:x02} {2:x02}", v, (int) v / 94 + 33, v % 94 + 33);
// Two-byte output: value is split into quotient and remainder by 94, each
// offset by 33 (0x21).
349 bytes [byteIndex++] = (byte) (value / 94 + 33);
350 bytes [byteIndex++] = (byte) (value % 94 + 33);
// One-byte output (ASCII or half-width kana).
354 bytes [byteIndex++] = (byte) value;
359 // must end in ASCII mode
360 if (shifted_in_conv) {
361 bytes [byteIndex++] = 0x0F;
362 shifted_in_conv = false;
365 if (m != ISO2022JPMode.ASCII)
366 SwitchMode (bytes, ref byteIndex, ref byteCount, ref m, ISO2022JPMode.ASCII);
368 return byteIndex - start;
// Restores the initial state: ASCII mode with both shift flags cleared.
372 public override void Reset ()
374 m = ISO2022JPMode.ASCII;
375 shifted_in_conv = shifted_in_count = false;
// Stateful decoder used by ISO2022JPEncoding.
380 internal class ISO2022JPDecoder : Decoder
// Lookup table holder (jisx0208ToUnicode) shared by all decoder instances.
382 static JISConvert convert = JISConvert.Convert;
384 readonly bool allow_shift_io;
// Current mode; assigned by both GetCharCount and GetChars.
385 ISO2022JPMode m = ISO2022JPMode.ASCII;
// Shift flags are kept separately for converting (_conv) and counting (_count).
386 bool shifted_in_conv, shifted_in_count;
// NOTE(review): only allowShiftIO is stored in the lines shown; allow1ByteKana
// appears to be unused by the decoder - confirm.
388 public ISO2022JPDecoder (bool allow1ByteKana, bool allowShiftIO)
390 this.allow_shift_io = allowShiftIO;
// Counts the characters that bytes[index .. index + count) decodes to, following
// the same escape-sequence and shift handling as GetChars without producing output.
// Throws ArgumentException when an escape sequence has an unexpected second or
// third byte.
394 public override int GetCharCount (byte [] bytes, int index, int count)
398 int end = index + count;
399 for (int i = index; i < end; i++) {
// NOTE(review): the byte tests that select between the two assignments below
// are not shown - presumably the 0x0F / 0x0E bytes the encoder writes; confirm.
400 if (allow_shift_io) {
403 shifted_in_count = false;
406 shifted_in_count = true;
// Any byte other than ESC (0x1B) is character data.
410 if (bytes [i] != 0x1B) {
411 if (!shifted_in_count && m == ISO2022JPMode.JISX0208) {
413 break; // incomplete head of wide char
419 ret++; // half-kana or ASCII
422 break; // incomplete escape sequence
// Second byte of the escape sequence: 0x24 ('$') or 0x28 ('(').
425 if (bytes [i] == 0x24)
427 else if (bytes [i] == 0x28)
430 throw new ArgumentException ("Unexpected ISO-2022-JP escape sequence.");
// Third byte: 0x42 ('B') selects JISX0208 or ASCII depending on `wide`;
// 0x49 ('I') selects JISX0201.
432 if (bytes [i] == 0x42)
433 m = wide ? ISO2022JPMode.JISX0208 : ISO2022JPMode.ASCII;
434 else if (bytes [i] == 0x49)
435 m = ISO2022JPMode.JISX0201;
437 throw new ArgumentException (String.Format ("Unexpected ISO-2022-JP escape sequence. Ended with 0x{0:X04}", bytes [i]));
// Reads the 16-bit entry stored low byte first at offset `value` of
// convert.jisx0208ToUnicode. When value + 1 is past the end of the table the
// other arm of the conditional is returned instead (that arm is not shown here).
443 private int ToChar (int value)
446 return value + 1 >= convert.jisx0208ToUnicode.Length ?
448 ((int) (convert.jisx0208ToUnicode [value])) |
449 (((int) (convert.jisx0208ToUnicode [value + 1])) << 8);
// Decodes bytes[byteIndex .. byteIndex + byteCount) into chars starting at
// charIndex and returns the number of characters written (charIndex - start).
// Throws ArgumentException when an escape sequence has an unexpected second or
// third byte.
452 public override int GetChars (byte [] bytes, int byteIndex, int byteCount, char [] chars, int charIndex)
454 int start = charIndex;
455 int end = byteIndex + byteCount;
// The loop also stops, without an error, once chars is full.
456 for (int i = byteIndex; i < end && charIndex < chars.Length; i++) {
// NOTE(review): the byte tests that select between the two assignments below
// are not shown - presumably the 0x0F / 0x0E bytes the encoder writes; confirm.
457 if (allow_shift_io) {
460 shifted_in_conv = false;
463 shifted_in_conv = true;
// Any byte other than ESC (0x1B) is character data.
468 if (bytes [i] != 0x1B) {
// Kana state: bytes below 0x60 map to bytes [i] + 0xFF40 (0x21 gives U+FF61).
469 if (shifted_in_conv || m == ISO2022JPMode.JISX0201) {
471 if (bytes [i] < 0x60)
472 chars [charIndex++] = (char) (bytes [i] + 0xFF40);
475 chars [charIndex++] = '?';
477 else if (m == ISO2022JPMode.JISX0208) {
479 break; // incomplete head of wide char
481 // am so lazy, so reusing jis2sjis
// s1 / s2 are derived from the current and the next byte; v starts as
// (s1 - 0x81) * 0xBC.
482 int s1 = ((bytes [i] - 1) >> 1) + ((bytes [i] <= 0x5e) ? 0x71 : 0xb1);
483 int s2 = bytes [i + 1] + (((bytes [i] & 1) != 0) ? 0x20 : 0x7e);
484 int v = (s1 - 0x81) * 0xBC;
489 chars [charIndex++] = '?';
491 chars [charIndex++] = (char) ch;
494 // LAMESPEC: actually this should not
495 // be allowed when 1byte-kana is not
496 // allowed, but MS.NET seems to allow
// 8-bit bytes 0xA1..0xDF map to U+FF61..U+FF9F.
498 else if (bytes [i] > 0xA0 && bytes [i] < 0xE0) // half-width Katakana
499 chars [charIndex++] = (char) (bytes [i] - 0xA0 + 0xFF60);
// Everything else is copied through as a character with the same value.
501 chars [charIndex++] = (char) bytes [i];
505 break; // incomplete escape sequence
// Second byte of the escape sequence: 0x24 ('$') or 0x28 ('(').
508 if (bytes [i] == 0x24)
510 else if (bytes [i] == 0x28)
513 throw new ArgumentException ("Unexpected ISO-2022-JP escape sequence.");
// Third byte: 0x42 ('B') selects JISX0208 or ASCII depending on `wide`;
// 0x49 ('I') selects JISX0201.
515 if (bytes [i] == 0x42)
516 m = wide ? ISO2022JPMode.JISX0208 : ISO2022JPMode.ASCII;
517 else if (bytes [i] == 0x49)
518 m = ISO2022JPMode.JISX0201;
520 throw new ArgumentException (String.Format ("Unexpected ISO-2022-JP escape sequence. Ended with 0x{0:X04}", bytes [i]));
524 return charIndex - start;
// Restores the initial state: ASCII mode with both shift flags cleared.
528 public override void Reset ()
530 m = ISO2022JPMode.ASCII;
531 shifted_in_count = shifted_in_conv = false;
// Alias class: behaves exactly like CP50220, from which it inherits everything.
// NOTE(review): the ENC<name> form presumably lets the encoding be located by
// its web name - confirm against the I18N lookup code.
537 public class ENCiso_2022_jp : CP50220
539 public ENCiso_2022_jp () : base() {}
541 }; // class ENCiso_2022_jp