5 // Atsushi Enomoto <atsushi@ximian.com>
12 using MonoEncoder = I18N.Common.MonoSafeEncoder;
13 using MonoEncoding = I18N.Common.MonoSafeEncoding;
// Encoding class for code page 50220. It forwards (50220, false, false) to the
// ISO2022JPEncoding constructor, i.e. allow1ByteKana and allowShiftIO are both off.
19 public class CP50220 : ISO2022JPEncoding
22 : base (50220, false, false)
// Display name reported for this code page.
26 public override string EncodingName {
27 get { return "Japanese (JIS)"; }
// Encoding class for code page 50221. It forwards (50221, true, false) to the
// ISO2022JPEncoding constructor: allow1ByteKana is on, allowShiftIO is off.
32 public class CP50221 : ISO2022JPEncoding
35 : base (50221, true, false)
// Display name reported for this code page.
39 public override string EncodingName {
40 get { return "Japanese (JIS-Allow 1 byte Kana)"; }
// Encoding class for code page 50222. It forwards (50222, true, true) to the
// ISO2022JPEncoding constructor: both allow1ByteKana and allowShiftIO are on.
45 public class CP50222 : ISO2022JPEncoding
48 : base (50222, true, true)
// Display name reported for this code page.
52 public override string EncodingName {
53 get { return "Japanese (JIS-Allow 1 byte Kana - SO/SI)"; }
// Common base of the CP50220, CP50221 and CP50222 classes in this file.
// MonoEncoding is this file's alias for I18N.Common.MonoSafeEncoding.
58 public class ISO2022JPEncoding : MonoEncoding
// Stores the two option flags and hands codePage plus the constant 932 to the
// base constructor.
// NOTE(review): 932 is presumably the matching Windows code page; confirm
// against the MonoEncoding constructor.
public ISO2022JPEncoding (int codePage, bool allow1ByteKana, bool allowShiftIO)
	: base (codePage, 932)
{
	allow_shift_io = allowShiftIO;
	allow_1byte_kana = allow1ByteKana;
}
// Option flags set once by the constructor; they are passed to every
// ISO2022JPEncoder / ISO2022JPDecoder this class creates.
67 readonly bool allow_1byte_kana, allow_shift_io;
// Always reports "iso-2022-jp", whichever code page the instance was built for.
public override string BodyName
{
	get
	{
		return "iso-2022-jp";
	}
}
// Always reports "iso-2022-jp", the same value as BodyName.
public override string HeaderName
{
	get
	{
		return "iso-2022-jp";
	}
}
// Reports "csISO2022JP"; note this differs from BodyName / HeaderName.
public override string WebName
{
	get
	{
		return "csISO2022JP";
	}
}
// Returns an upper bound on the number of bytes needed to encode charCount
// characters.
//
// Per character the encoder in this file writes at most one shift byte
// (SO/SI), one three-byte escape sequence and a two-byte code: 6 bytes.
// At the end it may add one SI byte and a three-byte escape back to ASCII:
// 4 bytes. The previous formula (charCount / 2 * 5 + 4) returned 4 for a
// single double-byte character, which needs 3 + 2 + 3 = 8 bytes, so buffers
// sized from it could be too small.
//
// NOTE(review): bytes emitted by an encoder fallback are not accounted for.
//
// Throws ArgumentOutOfRangeException when charCount is negative or when the
// bound does not fit in an int.
public override int GetMaxByteCount (int charCount)
{
	if (charCount < 0)
		throw new ArgumentOutOfRangeException ("charCount");

	// Computed in 64 bits so that a huge charCount cannot wrap around silently.
	long upperBound = (long) charCount * 6 + 4;
	if (upperBound > int.MaxValue)
		throw new ArgumentOutOfRangeException ("charCount");

	return (int) upperBound;
}
// NOTE(review): the body of this override is not visible in this excerpt.
// From the signature it takes a byte count and returns a char count.
87 public override int GetMaxCharCount (int byteCount)
// Pointer-based encode entry point.
// The first return reuses the caller-supplied state object, cast to
// ISO2022JPEncoder; the second builds a fresh encoder from this encoding's flags.
// NOTE(review): the condition choosing between the two returns is not visible
// here (presumably a null check on state).
// Both paths pass the literal true as the encoder's flush argument, so the
// flush parameter of this method is not forwarded.
94 protected override unsafe int GetBytesInternal(char* chars, int charCount, byte* bytes, int byteCount, bool flush, object state)
97 return ((ISO2022JPEncoder)state).GetBytesImpl (chars, charCount, bytes, byteCount, true);
99 return new ISO2022JPEncoder (this, allow_1byte_kana, allow_shift_io).GetBytesImpl (chars, charCount, bytes, byteCount, true);
// Counts the bytes needed for count chars using a throw-away encoder built
// from this encoding's option flags. The encoder is always asked to flush.
public unsafe override int GetByteCountImpl (char* chars, int count)
{
	ISO2022JPEncoder encoder = new ISO2022JPEncoder (this, allow_1byte_kana, allow_shift_io);
	return encoder.GetByteCountImpl (chars, count, true);
}
// Encodes charCount chars into the bytes buffer using a throw-away encoder
// built from this encoding's option flags. The encoder is always asked to flush.
public unsafe override int GetBytesImpl (char* chars, int charCount, byte* bytes, int byteCount)
{
	ISO2022JPEncoder encoder = new ISO2022JPEncoder (this, allow_1byte_kana, allow_shift_io);
	return encoder.GetBytesImpl (chars, charCount, bytes, byteCount, true);
}
// Array-based encode entry point.
// The first return reuses the caller-supplied state object, cast to
// ISO2022JPEncoder; the second builds a fresh encoder from this encoding's flags.
// NOTE(review): the condition choosing between the two returns is not visible
// here (presumably a null check on state).
// Both paths pass the literal true as the encoder's flush argument, so the
// flush parameter of this method is not forwarded.
112 protected override int GetBytesInternal(char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex, bool flush, object state)
115 return ((ISO2022JPEncoder)state).GetBytesInternal(chars, charIndex, charCount, bytes, byteIndex, true);
117 return new ISO2022JPEncoder(this, allow_1byte_kana, allow_shift_io).GetBytesInternal(chars, charIndex, charCount, bytes, byteIndex, true);
// Counts the bytes needed for charCount chars starting at charIndex, using a
// throw-away encoder that is always asked to flush.
public override int GetByteCount(char[] chars, int charIndex, int charCount)
{
	ISO2022JPEncoder encoder = new ISO2022JPEncoder(this, allow_1byte_kana, allow_shift_io);
	return encoder.GetByteCount(chars, charIndex, charCount, true);
}
// Encodes charCount chars starting at charIndex into bytes at byteIndex,
// using a throw-away encoder that is always asked to flush.
public override int GetBytes(char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex)
{
	ISO2022JPEncoder encoder = new ISO2022JPEncoder (this, allow_1byte_kana, allow_shift_io);
	return encoder.GetBytes(chars, charIndex, charCount, bytes, byteIndex, true);
}
// Counts the chars that count bytes starting at index decode to.
// A new decoder is created for each call, so no state carries over between calls.
public override int GetCharCount (byte [] bytes, int index, int count)
{
	ISO2022JPDecoder decoder = new ISO2022JPDecoder (allow_1byte_kana, allow_shift_io);
	return decoder.GetCharCount (bytes, index, count);
}
// Decodes byteCount bytes starting at byteIndex into chars at charIndex.
// A new decoder is created for each call, so no state carries over between calls.
public override int GetChars (byte [] bytes, int byteIndex, int byteCount, char [] chars, int charIndex)
{
	ISO2022JPDecoder decoder = new ISO2022JPDecoder (allow_1byte_kana, allow_shift_io);
	return decoder.GetChars (bytes, byteIndex, byteCount, chars, charIndex);
}
// Character-set state tracked by the encoder and decoder in this file.
// NOTE(review): the member list is not visible in this excerpt; the code in
// this file refers to ASCII, JISX0201 and JISX0208.
142 internal enum ISO2022JPMode {
// Encoder used by ISO2022JPEncoding. MonoEncoder is this file's alias for
// I18N.Common.MonoSafeEncoder.
148 internal class ISO2022JPEncoder : MonoEncoder
// Conversion tables; this class reads cjkToJis and extraToJis from it.
150 static JISConvert convert = JISConvert.Convert;
// Option flags copied from the constructor arguments.
152 readonly bool allow_1byte_kana, allow_shift_io;
// Character set currently selected in the output; starts out as ASCII.
154 ISO2022JPMode m = ISO2022JPMode.ASCII;
// Shift state, kept separately for counting passes (output buffer is null)
// and for real conversions; see IsShifted.
155 bool shifted_in_count, shifted_in_conv;
// Copies the two option flags into the encoder.
// NOTE(review): how owner is used (presumably passed to the base constructor)
// is not visible in this excerpt.
157 public ISO2022JPEncoder(MonoEncoding owner, bool allow1ByteKana, bool allowShiftIO)
160 this.allow_1byte_kana = allow1ByteKana;
161 this.allow_shift_io = allowShiftIO;
// Counts bytes by running the encode routine without an output buffer:
// a null byte pointer with a capacity of zero.
public unsafe override int GetByteCountImpl (char* chars, int charCount, bool flush)
{
	byte* noOutput = null;
	return GetBytesImpl(chars, charCount, noOutput, 0, flush);
}
// Counts bytes by running the array-based encode routine without an output
// array. The caller's flush value is not forwarded; true is always passed on.
public override int GetByteCount(char[] chars, int charIndex, int charCount, bool flush)
{
	byte[] noOutput = null;
	return GetBytesInternal (chars, charIndex, charCount, noOutput, 0, true);
}
// Returns the shift flag that belongs to the current kind of pass:
// the counting flag when there is no output buffer, the conversion flag otherwise.
private unsafe bool IsShifted(byte *bytes)
{
	if (bytes == null)
		return shifted_in_count;

	return shifted_in_conv;
}
// Stores state into one of the two shift flags.
// NOTE(review): the condition selecting the flag is not visible in this
// excerpt; presumably it mirrors IsShifted (null buffer selects the counting flag).
182 private unsafe void SetShifted(byte *bytes, bool state)
185 shifted_in_count = state;
187 shifted_in_conv = state;
190 // Writes the escape sequence that selects character set `next`. The method is void: an undersized buffer is reported by the throw below, not by a return value.
191 private unsafe void SwitchMode (byte* bytes, ref int byteIndex,
192 ref int byteCount, ref ISO2022JPMode cur, ISO2022JPMode next)
197 // If bytes == null we are just counting chars..
// NOTE(review): the one-string ArgumentOutOfRangeException constructor takes
// the parameter name, so this text ends up in ParamName, not in the message.
205 throw new ArgumentOutOfRangeException ("Insufficient byte buffer.");
// Every sequence starts with ESC (0x1B) followed by two selector bytes.
207 bytes [byteIndex++] = 0x1B;
// ESC ( I
209 case ISO2022JPMode.JISX0201:
210 bytes [byteIndex++] = 0x28;
211 bytes [byteIndex++] = 0x49;
// ESC $ B
213 case ISO2022JPMode.JISX0208:
214 bytes [byteIndex++] = 0x24;
215 bytes [byteIndex++] = 0x42;
// ESC ( B -- the case label for this pair is not visible in this excerpt
// (presumably the default / ASCII branch).
218 bytes [byteIndex++] = 0x28;
219 bytes [byteIndex++] = 0x42;
// Returns the shift flag that belongs to the current kind of pass:
// the counting flag when there is no output array, the conversion flag otherwise.
private bool IsShifted(byte[] bytes)
{
	if (bytes == null)
		return shifted_in_count;

	return shifted_in_conv;
}
// Stores state into one of the two shift flags.
// NOTE(review): the condition selecting the flag is not visible in this
// excerpt; presumably it mirrors IsShifted (null array selects the counting flag).
230 private void SetShifted(byte[] bytes, bool state)
233 shifted_in_count = state;
235 shifted_in_conv = state;
// Array-based twin of the pointer SwitchMode: writes the escape sequence that
// selects character set `next`.
238 private void SwitchMode(byte[] bytes, ref int byteIndex,
239 ref int byteCount, ref ISO2022JPMode cur, ISO2022JPMode next)
244 // If bytes == null we are just counting chars..
// NOTE(review): the one-string ArgumentOutOfRangeException constructor takes
// the parameter name, so this text ends up in ParamName, not in the message.
253 throw new ArgumentOutOfRangeException("Insufficient byte buffer.");
// Every sequence starts with ESC (0x1B) followed by two selector bytes.
255 bytes[byteIndex++] = 0x1B;
// ESC ( I
258 case ISO2022JPMode.JISX0201:
259 bytes[byteIndex++] = 0x28;
260 bytes[byteIndex++] = 0x49;
// ESC $ B
262 case ISO2022JPMode.JISX0208:
263 bytes[byteIndex++] = 0x24;
264 bytes[byteIndex++] = 0x42;
// ESC ( B -- the case label for this pair is not visible in this excerpt
// (presumably the default / ASCII branch).
267 bytes[byteIndex++] = 0x28;
268 bytes[byteIndex++] = 0x42;
// Replacement table used when half-width kana is not allowed: the encode
// routines index it with (ch - 0xFF60) to obtain a full-width character.
// Index 0 (U+FF60) maps to '\0'.
// NOTE(review): the table lists 64 entries (indices 0..63, i.e. U+FF60..U+FF9F),
// but both encode routines apply it for ch <= 0xFFA0, which yields index 64.
// That looks like an out-of-range access for U+FFA0; confirm and tighten the
// range check in the callers.
276 static readonly char [] full_width_map = new char [] {
277 '\0', '\u3002', '\u300C', '\u300D', '\u3001', '\u30FB', // to nakaguro
278 '\u30F2', '\u30A1', '\u30A3', '\u30A5', '\u30A7', '\u30A9', '\u30E3', '\u30E5', '\u30E7', '\u30C3', // to small tsu
279 '\u30FC', '\u30A2', '\u30A4', '\u30A6', '\u30A8', '\u30AA', // A-O
280 '\u30AB', '\u30AD', '\u30AF', '\u30B1', '\u30B3',
281 '\u30B5', '\u30B7', '\u30B9', '\u30BB', '\u30BD',
282 '\u30BF', '\u30C1', '\u30C4', '\u30C6', '\u30C8',
283 '\u30CA', '\u30CB', '\u30CC', '\u30CD', '\u30CE',
284 '\u30CF', '\u30D2', '\u30D5', '\u30D8', '\u30DB',
285 '\u30DE', '\u30DF', '\u30E0', '\u30E1', '\u30E2',
286 '\u30E4', '\u30E6', '\u30E8', // Ya-Yo
287 '\u30E9', '\u30EA', '\u30EB', '\u30EC', '\u30ED',
288 '\u30EF', '\u30F3', '\u309B', '\u309C' };
// Pointer-based encode loop. With a null bytes pointer it only advances
// byteIndex, which is how GetByteCountImpl uses it to count.
// Returns the number of bytes produced (byteIndex - start).
// NOTE(review): the flush parameter is not referenced on any line visible in
// this excerpt.
291 public unsafe override int GetBytesImpl (
292 char* chars, int charCount,
293 byte* bytes, int byteCount, bool flush)
298 int start = byteIndex;
299 int end = charIndex + charCount;
302 for (int i = charIndex; i < end; i++, charCount--) {
305 // When half-kana is not allowed and it is
306 // actually in the input, convert to full width
// NOTE(review): full_width_map has 64 entries, so ch == 0xFFA0 gives index 64
// here, which looks out of range; confirm.
308 if (!allow_1byte_kana &&
309 ch >= 0xFF60 && ch <= 0xFFA0)
310 ch = full_width_map [ch - 0xFF60];
312 if (ch >= 0x2010 && ch <= 0x9FA5)
// Leave the shifted state first: 0x0F is the SI control byte.
314 if (IsShifted(bytes)) {
315 var offset = byteIndex++;
316 if (bytes != null) bytes [offset] = 0x0F;
317 SetShifted(bytes, false);
321 case ISO2022JPMode.JISX0208:
324 SwitchMode (bytes, ref byteIndex, ref byteCount, ref m, ISO2022JPMode.JISX0208);
327 // This range contains the bulk of the CJK set.
// Two table bytes per character, combined low byte first.
328 value = (ch - 0x2010) * 2;
329 value = ((int)(convert.cjkToJis[value])) |
330 (((int)(convert.cjkToJis[value + 1])) << 8);
// NOTE(review): 0xFF60 satisfies this test, so the half-width branch further
// down is only reached from 0xFF61 upward.
331 } else if (ch >= 0xFF01 && ch <= 0xFF60) {
332 if (IsShifted(bytes)) {
333 var offset = byteIndex++;
334 if (bytes != null) bytes [offset] = 0x0F;
335 SetShifted(bytes, false);
339 case ISO2022JPMode.JISX0208:
342 SwitchMode (bytes, ref byteIndex, ref byteCount, ref m, ISO2022JPMode.JISX0208);
346 // This range contains extra characters,
347 value = (ch - 0xFF01) * 2;
348 value = ((int)(convert.extraToJis[value])) |
349 (((int)(convert.extraToJis[value + 1])) << 8);
350 } else if (ch >= 0xFF60 && ch <= 0xFFA0) {
351 // disallowed half-width kana is
352 // already converted to full-width kana
353 // so here we don't have to consider it.
// With shift in/out enabled, enter the shifted state with SO (0x0E); the
// visible alternative below switches to JISX0201 with an escape sequence.
355 if (allow_shift_io) {
356 if (!IsShifted(bytes)) {
357 var offset = byteIndex++;
358 if (bytes != null) bytes [offset] = 0x0E;
359 SetShifted(bytes, true);
364 case ISO2022JPMode.JISX0201:
367 SwitchMode (bytes, ref byteIndex, ref byteCount, ref m, ISO2022JPMode.JISX0201);
// 7-bit characters: drop any shift, then make sure ASCII is selected.
372 } else if (ch < 128) {
373 if (IsShifted(bytes)) {
374 var offset = byteIndex++;
375 if (bytes != null) bytes [offset] = 0x0F;
376 SetShifted(bytes, false);
379 SwitchMode (bytes, ref byteIndex, ref byteCount, ref m, ISO2022JPMode.ASCII);
383 chars, ref i, ref charCount,
384 bytes, ref byteIndex, ref byteCount, this);
385 // skip non-convertible character
389 //Console.WriteLine ("{0:X04} : {1:x02} {2:x02}", v, (int) v / 94 + 33, v % 94 + 33);
// Values of 0x100 and above are written as two bytes: (value / 94 + 33)
// followed by (value % 94 + 33). Smaller values are written as one byte.
390 if (value >= 0x100) {
393 bytes [byteIndex++] = (byte) (value / 94 + 33);
394 bytes [byteIndex++] = (byte) (value % 94 + 33);
401 var offset = byteIndex++;
402 if (bytes != null) bytes [offset] = (byte) value;
407 // must end in ASCII mode
408 if (IsShifted(bytes)) {
409 var offset = byteIndex++;
410 if (bytes != null) bytes [offset] = 0x0F;
411 SetShifted(bytes, false);
414 if (m != ISO2022JPMode.ASCII)
415 SwitchMode (bytes, ref byteIndex, ref byteCount, ref m, ISO2022JPMode.ASCII);
417 return byteIndex - start;
// Array-based encode loop, parallel to the pointer-based GetBytesImpl.
// With a null bytes array it only advances byteIndex, which is how
// GetByteCount uses it to count.
// Returns the number of bytes produced (byteIndex - start).
// NOTE(review): the flush parameter is not referenced on any line visible in
// this excerpt.
420 internal int GetBytesInternal(char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex, bool flush)
422 int start = byteIndex;
423 int end = charIndex + charCount;
// NOTE(review): byteCount is the whole array length, not the space left after
// byteIndex; confirm this is what SwitchMode's capacity check expects.
425 int byteCount = bytes != null ? bytes.Length : 0;
427 for (int i = charIndex; i < end; i++, charCount--)
431 // When half-kana is not allowed and it is
432 // actually in the input, convert to full width
// NOTE(review): full_width_map has 64 entries, so ch == 0xFFA0 gives index 64
// here, which looks out of range; confirm.
434 if (!allow_1byte_kana &&
435 ch >= 0xFF60 && ch <= 0xFFA0)
436 ch = full_width_map[ch - 0xFF60];
438 if (ch >= 0x2010 && ch <= 0x9FA5)
// Leave the shifted state first: 0x0F is the SI control byte.
440 if (IsShifted (bytes))
442 var offset = byteIndex++;
443 if (bytes != null) bytes[offset] = 0x0F;
444 SetShifted (bytes, false);
449 case ISO2022JPMode.JISX0208:
452 SwitchMode(bytes, ref byteIndex, ref byteCount, ref m, ISO2022JPMode.JISX0208);
455 // This range contains the bulk of the CJK set.
// Two table bytes per character, combined low byte first.
456 value = (ch - 0x2010) * 2;
457 value = ((int)(convert.cjkToJis[value])) |
458 (((int)(convert.cjkToJis[value + 1])) << 8);
// NOTE(review): 0xFF60 satisfies this test, so the half-width branch further
// down is only reached from 0xFF61 upward.
460 else if (ch >= 0xFF01 && ch <= 0xFF60)
462 if (IsShifted(bytes))
464 var offset = byteIndex++;
465 if (bytes != null) bytes[offset] = 0x0F;
466 SetShifted (bytes, false);
471 case ISO2022JPMode.JISX0208:
474 SwitchMode(bytes, ref byteIndex, ref byteCount, ref m, ISO2022JPMode.JISX0208);
478 // This range contains extra characters,
479 value = (ch - 0xFF01) * 2;
480 value = ((int)(convert.extraToJis[value])) |
481 (((int)(convert.extraToJis[value + 1])) << 8);
483 else if (ch >= 0xFF60 && ch <= 0xFFA0)
485 // disallowed half-width kana is
486 // already converted to full-width kana
487 // so here we don't have to consider it.
// Enter the shifted state with SO (0x0E) when not already shifted; the other
// visible path switches to JISX0201 with an escape sequence.
491 if (!IsShifted (bytes))
493 var offset = byteIndex++;
494 if (bytes != null) bytes[offset] = 0x0E;
495 SetShifted (bytes, true);
503 case ISO2022JPMode.JISX0201:
506 SwitchMode(bytes, ref byteIndex, ref byteCount, ref m, ISO2022JPMode.JISX0201);
// Drop any shift, then make sure ASCII is selected.
514 if (IsShifted (bytes))
516 var offset = byteIndex++;
517 if (bytes != null) bytes[offset] = 0x0F;
518 SetShifted (bytes, false);
521 SwitchMode(bytes, ref byteIndex, ref byteCount, ref m, ISO2022JPMode.ASCII);
526 HandleFallback (chars, ref i, ref charCount,
527 bytes, ref byteIndex, ref byteCount, this);
528 // skip non-convertible character
532 //Console.WriteLine ("{0:X04} : {1:x02} {2:x02}", v, (int) v / 94 + 33, v % 94 + 33);
// Two-byte form: (value / 94 + 33) followed by (value % 94 + 33).
538 bytes[byteIndex++] = (byte)(value / 94 + 33);
539 bytes[byteIndex++] = (byte)(value % 94 + 33);
// One-byte form.
549 var offset = byteIndex++;
550 if (bytes != null) bytes[offset] = (byte)value;
556 // must end in ASCII mode
557 if (IsShifted (bytes))
559 var offset = byteIndex++;
560 if (bytes != null) bytes[offset] = 0x0F;
561 SetShifted (bytes, false);
564 if (m != ISO2022JPMode.ASCII)
565 SwitchMode(bytes, ref byteIndex, ref byteCount, ref m, ISO2022JPMode.ASCII);
568 return byteIndex - start;
// Public array-based entry point; forwards every argument, including flush,
// to GetBytesInternal and returns its byte count.
public override int GetBytes(char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex, bool flush)
{
	int written = GetBytesInternal (chars, charIndex, charCount, bytes, byteIndex, flush);
	return written;
}
// Puts the encoder back into its initial state: ASCII selected and both shift
// flags cleared.
public override void Reset ()
{
	shifted_in_count = false;
	shifted_in_conv = false;
	m = ISO2022JPMode.ASCII;
}
// Decoder used by ISO2022JPEncoding.GetCharCount / GetChars.
585 internal class ISO2022JPDecoder : Decoder
// Conversion tables; this class reads jisx0208ToUnicode from it.
587 static JISConvert convert = JISConvert.Convert;
// Whether the shift control bytes are interpreted while decoding.
589 readonly bool allow_shift_io;
// Character set currently selected; starts out as ASCII. This single field is
// written by both GetCharCount and GetChars.
590 ISO2022JPMode m = ISO2022JPMode.ASCII;
// Shift state, kept separately for GetChars (conv) and GetCharCount (count).
591 bool shifted_in_conv, shifted_in_count;
// Keeps only the shift in/out option; allow1ByteKana is accepted but not
// stored by this constructor.
public ISO2022JPDecoder (bool allow1ByteKana, bool allowShiftIO)
{
	allow_shift_io = allowShiftIO;
}
// Scans count bytes starting at index and counts the chars they decode to,
// following shift bytes and escape sequences along the way.
// NOTE(review): this method assigns the mode field m, which GetChars also
// uses; only the shift flag has a separate counting copy.
599 public override int GetCharCount (byte [] bytes, int index, int count)
603 int end = index + count;
604 for (int i = index; i < end; i++) {
// Shift handling applies only when shift in/out is enabled; the byte tests
// themselves are not visible in this excerpt.
605 if (allow_shift_io) {
608 shifted_in_count = false;
611 shifted_in_count = true;
// Any byte other than ESC (0x1B) is data.
615 if (bytes [i] != 0x1B) {
616 if (!shifted_in_count && m == ISO2022JPMode.JISX0208) {
618 break; // incomplete head of wide char
624 ret++; // half-kana or ASCII
627 break; // incomplete escape sequence
// Second byte of the escape sequence: 0x24 is '$', 0x28 is '('.
630 if (bytes [i] == 0x24)
632 else if (bytes [i] == 0x28)
// Third byte selects the mode: 'B' (0x42) or '@' (0x40) give JISX0208 when the
// sequence was the wide form and ASCII otherwise; 'J' (0x4A) gives ASCII;
// 'I' (0x49) gives JISX0201.
639 if (bytes [i] == 0x42 || bytes [i] == 0x40)
640 m = wide ? ISO2022JPMode.JISX0208 : ISO2022JPMode.ASCII;
641 else if (bytes [i] == 0x4A) // obsoleted
642 m = ISO2022JPMode.ASCII;
643 else if (bytes [i] == 0x49)
644 m = ISO2022JPMode.JISX0201;
// Looks value up in convert.jisx0208ToUnicode, combining two consecutive table
// bytes low byte first.
// NOTE(review): the result returned when value is negative or value + 1 is
// past the end of the table is not visible in this excerpt.
652 private int ToChar (int value)
655 return value + 1 >= convert.jisx0208ToUnicode.Length || value < 0 ?
657 ((int) (convert.jisx0208ToUnicode [value])) |
658 (((int) (convert.jisx0208ToUnicode [value + 1])) << 8);
// Decodes byteCount bytes starting at byteIndex into chars at charIndex and
// returns the number of chars written (charIndex - start).
661 public override int GetChars (byte [] bytes, int byteIndex, int byteCount, char [] chars, int charIndex)
663 int start = charIndex;
664 int end = byteIndex + byteCount;
// The loop stops when the input is exhausted or the output array is full.
665 for (int i = byteIndex; i < end && charIndex < chars.Length; i++) {
// Shift handling applies only when shift in/out is enabled; the byte tests
// themselves are not visible in this excerpt.
666 if (allow_shift_io) {
669 shifted_in_conv = false;
672 shifted_in_conv = true;
// Any byte other than ESC (0x1B) is data.
677 if (bytes [i] != 0x1B) {
// Shifted or JISX0201 mode: bytes below 0x60 are offset by 0xFF40; a '?' is
// written on the other visible path.
678 if (shifted_in_conv || m == ISO2022JPMode.JISX0201) {
680 if (bytes [i] < 0x60)
681 chars [charIndex++] = (char) (bytes [i] + 0xFF40);
684 chars [charIndex++] = '?';
// JISX0208 mode consumes two bytes, bytes [i] and bytes [i + 1].
686 else if (m == ISO2022JPMode.JISX0208) {
688 break; // incomplete head of wide char
690 // am so lazy, so reusing jis2sjis
691 int s1 = ((bytes [i] - 1) >> 1) + ((bytes [i] <= 0x5e) ? 0x71 : 0xb1);
692 int s2 = bytes [i + 1] + (((bytes [i] & 1) != 0) ? 0x20 : 0x7e);
693 int v = (s1 - 0x81) * 0xBC;
698 chars [charIndex++] = '?';
700 chars [charIndex++] = (char) ch;
703 // LAMESPEC: actually this should not
704 // be allowed when 1byte-kana is not
705 // allowed, but MS.NET seems to allow
707 else if (bytes [i] > 0xA0 && bytes [i] < 0xE0) // half-width Katakana
708 chars [charIndex++] = (char) (bytes [i] - 0xA0 + 0xFF60);
// Everything else is copied through as a single char.
710 chars [charIndex++] = (char) bytes [i];
714 break; // incomplete escape sequence
// Second byte of the escape sequence: 0x24 is '$', 0x28 is '('.
717 if (bytes [i] == 0x24)
719 else if (bytes [i] == 0x28)
// An unrecognised second byte is echoed together with the ESC.
// NOTE(review): the loop condition guarantees one free slot, but two chars are
// written here and no further bounds check is visible; confirm.
722 chars [charIndex++] = '\x1B';
723 chars [charIndex++] = (char) bytes [i];
// Third byte selects the mode, as in GetCharCount.
727 if (bytes [i] == 0x42 || bytes [i] == 0x40)
728 m = wide ? ISO2022JPMode.JISX0208 : ISO2022JPMode.ASCII;
729 else if (bytes [i] == 0x4A) // obsoleted
730 m = ISO2022JPMode.ASCII;
731 else if (bytes [i] == 0x49)
732 m = ISO2022JPMode.JISX0201;
// An unrecognised third byte echoes the whole three-byte sequence.
// NOTE(review): three chars are written with no bounds check visible here.
734 chars [charIndex++] = '\x1B';
735 chars [charIndex++] = (char) bytes [i - 1];
736 chars [charIndex++] = (char) bytes [i];
741 return charIndex - start;
// Puts the decoder back into its initial state: ASCII selected and both shift
// flags cleared.
public override void Reset ()
{
	shifted_in_conv = false;
	shifted_in_count = false;
	m = ISO2022JPMode.ASCII;
}
// Alias class: behaves exactly like CP50220 and adds nothing of its own.
public class ENCiso_2022_jp : CP50220
{
	public ENCiso_2022_jp ()
		: base ()
	{
	}
} // class ENCiso_2022_jp