5 // Atsushi Enomoto <atsushi@ximian.com>
12 using MonoEncoder = I18N.Common.MonoSafeEncoder;
13 using MonoEncoding = I18N.Common.MonoSafeEncoding;
// Code page 50220. Passes false for both the allow1ByteKana and allowShiftIO
// arguments of the ISO2022JPEncoding constructor.
19 public class CP50220 : ISO2022JPEncoding
22 : base (50220, false, false)
// Display name reported for this code page.
26 public override string EncodingName {
27 get { return "Japanese (JIS)"; }
// Code page 50221. Passes allow1ByteKana = true and allowShiftIO = false to
// the ISO2022JPEncoding constructor.
32 public class CP50221 : ISO2022JPEncoding
35 : base (50221, true, false)
// Display name reported for this code page.
39 public override string EncodingName {
40 get { return "Japanese (JIS-Allow 1 byte Kana)"; }
// Code page 50222. Passes true for both allow1ByteKana and allowShiftIO to
// the ISO2022JPEncoding constructor.
45 public class CP50222 : ISO2022JPEncoding
48 : base (50222, true, true)
// Display name reported for this code page.
52 public override string EncodingName {
53 get { return "Japanese (JIS-Allow 1 byte Kana - SO/SI)"; }
// Common base of the CP5022x classes. It stores the two option flags and
// delegates the actual conversion to ISO2022JPEncoder / ISO2022JPDecoder
// instances.
58 public class ISO2022JPEncoding : MonoEncoding
// codePage is forwarded to the base constructor together with the constant
// 932; the two flags are stored and handed to every encoder/decoder created
// by the methods of this class.
60 public ISO2022JPEncoding (int codePage, bool allow1ByteKana, bool allowShiftIO)
61 : base (codePage, 932)
63 this.allow_1byte_kana = allow1ByteKana;
64 this.allow_shift_io = allowShiftIO;
67 readonly bool allow_1byte_kana, allow_shift_io;
// BodyName and HeaderName both report "iso-2022-jp"; WebName reports
// "csISO2022JP".
69 public override string BodyName {
70 get { return "iso-2022-jp"; }
73 public override string HeaderName {
74 get { return "iso-2022-jp"; }
77 public override string WebName {
78 get { return "csISO2022JP"; }
// Upper bound on the number of bytes produced for charCount characters.
// NOTE(review): charCount / 2 is an integer division, so charCount == 1
// yields 4. Judging by the encoder in this file (3-byte escape sequence,
// 2-byte character, 3-byte escape back to ASCII) a single JIS X 0208
// character appears to need 8 bytes - confirm that this bound is sufficient.
81 public override int GetMaxByteCount (int charCount)
83 // ESC w ESC s ESC w ... (even number) ESC s
84 return charCount / 2 * 5 + 4;
// (The body of GetMaxCharCount is not part of this listing.)
87 public override int GetMaxCharCount (int byteCount)
// Pointer-based conversion entry point. One return uses the encoder passed in
// as `state`, the other a newly created encoder; the test that selects between
// them is not visible here (presumably a null check on `state` - confirm).
// NOTE(review): both calls pass the literal `true` where the `flush` parameter
// would be expected.
94 protected override unsafe int GetBytesInternal(char* chars, int charCount, byte* bytes, int byteCount, bool flush, object state)
97 return ((ISO2022JPEncoder)state).GetBytesImpl (chars, charCount, bytes, byteCount, true);
99 return new ISO2022JPEncoder (this, allow_1byte_kana, allow_shift_io).GetBytesImpl (chars, charCount, bytes, byteCount, true);
// Computes the byte count with a newly created encoder, so no state is
// carried between calls.
102 public unsafe override int GetByteCountImpl (char* chars, int count)
104 return new ISO2022JPEncoder (this, allow_1byte_kana, allow_shift_io).GetByteCountImpl (chars, count, true);
// Converts with a newly created encoder, so no state is carried between calls.
107 public unsafe override int GetBytesImpl (char* chars, int charCount, byte* bytes, int byteCount)
109 return new ISO2022JPEncoder (this, allow_1byte_kana, allow_shift_io).GetBytesImpl (chars, charCount, bytes, byteCount, true);
// Array-based counterpart of the pointer-based GetBytesInternal above; the
// same remark about the literal `true` applies.
112 protected override int GetBytesInternal(char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex, bool flush, object state)
115 return ((ISO2022JPEncoder)state).GetBytesInternal(chars, charIndex, charCount, bytes, byteIndex, true);
117 return new ISO2022JPEncoder(this, allow_1byte_kana, allow_shift_io).GetBytesInternal(chars, charIndex, charCount, bytes, byteIndex, true);
// Array-based byte count, computed by a newly created encoder.
120 public override int GetByteCount(char[] chars, int charIndex, int charCount)
122 return new ISO2022JPEncoder(this, allow_1byte_kana, allow_shift_io).GetByteCount(chars, charIndex, charCount, true);
// Array-based conversion, performed by a newly created encoder.
125 public override int GetBytes(char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex)
127 return new ISO2022JPEncoder (this, allow_1byte_kana, allow_shift_io).GetBytes(chars, charIndex, charCount, bytes, byteIndex, true);
// Decoding: each call creates a new ISO2022JPDecoder, so the mode and shift
// state always start from the decoder's initial values.
131 public override int GetCharCount (byte [] bytes, int index, int count)
133 return new ISO2022JPDecoder (allow_1byte_kana, allow_shift_io).GetCharCount (bytes, index, count);
136 public override int GetChars (byte [] bytes, int byteIndex, int byteCount, char [] chars, int charIndex)
138 return new ISO2022JPDecoder (allow_1byte_kana, allow_shift_io).GetChars (bytes, byteIndex, byteCount, chars, charIndex);
// Character-set mode tracked by the encoder and decoder in this file. The
// members are not part of this listing; the code here refers to ASCII,
// JISX0201 and JISX0208.
142 internal enum ISO2022JPMode {
// Stateful char -> byte converter. It tracks the current character-set mode
// (`m`) and a shift flag, and inserts ESC sequences and shift bytes
// (0x0E / 0x0F) into the output whenever the mode or shift state has to change.
// The same code paths serve as "count only" passes when the output buffer is
// null.
148 internal class ISO2022JPEncoder : MonoEncoder
150 static JISConvert convert = JISConvert.Convert;
152 readonly bool allow_1byte_kana, allow_shift_io;
// Current mode; starts as ASCII and is restored to ASCII by Reset().
154 ISO2022JPMode m = ISO2022JPMode.ASCII;
// Two independent shift flags: IsShifted() reads shifted_in_count when the
// output buffer is null (counting) and shifted_in_conv otherwise (converting).
155 bool shifted_in_count, shifted_in_conv;
157 public ISO2022JPEncoder(MonoEncoding owner, bool allow1ByteKana, bool allowShiftIO)
160 this.allow_1byte_kana = allow1ByteKana;
161 this.allow_shift_io = allowShiftIO;
// Counting is implemented as a conversion with a null output buffer.
165 public unsafe override int GetByteCountImpl (char* chars, int charCount, bool flush)
167 return GetBytesImpl(chars, charCount, null, 0, flush);
// Array-based count: conversion with a null output buffer.
// NOTE(review): passes the literal `true` rather than the `flush` parameter.
170 public override int GetByteCount(char[] chars, int charIndex, int charCount, bool flush)
172 return GetBytesInternal (chars, charIndex, charCount, null, 0, true);
// Returns the shift flag that belongs to the current pass: the counting flag
// when bytes is null, the conversion flag otherwise.
177 private unsafe bool IsShifted(byte *bytes)
179 return bytes == null ? shifted_in_count : shifted_in_conv;
// Stores `state` into one of the two shift flags. The test choosing between
// the two assignments is not visible here (presumably the same null check as
// in IsShifted - confirm).
182 private unsafe void SetShifted(byte *bytes, bool state)
185 shifted_in_count = state;
187 shifted_in_conv = state;
190 // Writes the ESC sequence for the `next` mode. This method returns void; when the buffer is too small it throws instead of returning a status.
191 private unsafe void SwitchMode (byte* bytes, ref int byteIndex,
192 ref int byteCount, ref ISO2022JPMode cur, ISO2022JPMode next)
197 // If bytes == null we are only computing the required byte count.
// NOTE(review): the single-string ArgumentOutOfRangeException constructor
// interprets its argument as the parameter name, not as the message.
205 throw new ArgumentOutOfRangeException ("Insufficient byte buffer.");
// 0x1B is ESC; the next two bytes select the character set:
//   JISX0201 -> 0x28 0x49  ("(I")
//   JISX0208 -> 0x24 0x42  ("$B")
//   remaining case -> 0x28 0x42  ("(B"); its label is not visible here,
//   presumably the ASCII/default case.
207 bytes [byteIndex++] = 0x1B;
209 case ISO2022JPMode.JISX0201:
210 bytes [byteIndex++] = 0x28;
211 bytes [byteIndex++] = 0x49;
213 case ISO2022JPMode.JISX0208:
214 bytes [byteIndex++] = 0x24;
215 bytes [byteIndex++] = 0x42;
218 bytes [byteIndex++] = 0x28;
219 bytes [byteIndex++] = 0x42;
// Array-based counterparts of IsShifted / SetShifted / SwitchMode above; the
// visible lines mirror the pointer versions.
225 private bool IsShifted(byte[] bytes)
227 return bytes == null ? shifted_in_count : shifted_in_conv;
230 private void SetShifted(byte[] bytes, bool state)
233 shifted_in_count = state;
235 shifted_in_conv = state;
238 private void SwitchMode(byte[] bytes, ref int byteIndex,
239 ref int byteCount, ref ISO2022JPMode cur, ISO2022JPMode next)
244 // If bytes == null we are only computing the required byte count.
// NOTE(review): same constructor remark as in the pointer version - the string
// is taken as the parameter name.
253 throw new ArgumentOutOfRangeException("Insufficient byte buffer.");
255 bytes[byteIndex++] = 0x1B;
258 case ISO2022JPMode.JISX0201:
259 bytes[byteIndex++] = 0x28;
260 bytes[byteIndex++] = 0x49;
262 case ISO2022JPMode.JISX0208:
263 bytes[byteIndex++] = 0x24;
264 bytes[byteIndex++] = 0x42;
267 bytes[byteIndex++] = 0x28;
268 bytes[byteIndex++] = 0x42;
// Replacement table used when 1-byte kana is not allowed: indexed by
// (ch - 0xFF60) for ch in 0xFF60..0xFFA0. Entry 0 is '\0'.
276 static readonly char [] full_width_map = new char [] {
277 '\0', '\u3002', '\u300C', '\u300D', '\u3001', '\u30FB', // to nakaguro
278 '\u30F2', '\u30A1', '\u30A3', '\u30A5', '\u30A7', '\u30A9', '\u30E3', '\u30E5', '\u30E7', '\u30C3', // to small tsu
279 '\u30FC', '\u30A2', '\u30A4', '\u30A6', '\u30A8', '\u30AA', // A-O
280 '\u30AB', '\u30AD', '\u30AF', '\u30B1', '\u30B3',
281 '\u30B5', '\u30B7', '\u30B9', '\u30BB', '\u30BD',
282 '\u30BF', '\u30C1', '\u30C4', '\u30C6', '\u30C8',
283 '\u30CA', '\u30CB', '\u30CC', '\u30CD', '\u30CE',
284 '\u30CF', '\u30D2', '\u30D5', '\u30D8', '\u30DB',
285 '\u30DE', '\u30DF', '\u30E0', '\u30E1', '\u30E2',
286 '\u30E4', '\u30E6', '\u30E8', // Ya-Yo
287 '\u30E9', '\u30EA', '\u30EB', '\u30EC', '\u30ED',
288 '\u30EF', '\u30F3', '\u309B', '\u309C' };
// Pointer-based conversion loop. For each input char it picks a branch by
// code-point range, emits a shift byte and/or ESC sequence when the state has
// to change, and then writes the character as one or two bytes. Returns the
// number of bytes produced (byteIndex - start). With bytes == null nothing is
// stored and only the index is advanced.
291 public unsafe override int GetBytesImpl (
292 char* chars, int charCount,
293 byte* bytes, int byteCount, bool flush)
298 int start = byteIndex;
299 int end = charIndex + charCount;
302 for (int i = charIndex; i < end; i++, charCount--) {
305 // When half-kana is not allowed and it is
306 // actually in the input, convert to full width
// NOTE(review): for ch == 0xFF60 the table yields '\0' (entry 0), which then
// falls into the `ch < 128` branch further down - confirm this is intended.
308 if (!allow_1byte_kana &&
309 ch >= 0xFF60 && ch <= 0xFFA0)
310 ch = full_width_map [ch - 0xFF60];
// Branch 1: 0x2010..0x9FA5, looked up in convert.cjkToJis (two table bytes per
// character, low byte first). A pending shift is cancelled with 0x0F first.
312 if (ch >= 0x2010 && ch <= 0x9FA5)
314 if (IsShifted(bytes)) {
315 var offset = byteIndex++;
316 if (bytes != null) bytes [offset] = 0x0F;
317 SetShifted(bytes, false);
321 case ISO2022JPMode.JISX0208:
324 SwitchMode (bytes, ref byteIndex, ref byteCount, ref m, ISO2022JPMode.JISX0208);
327 // This range contains the bulk of the CJK set.
328 value = (ch - 0x2010) * 2;
329 value = ((int)(convert.cjkToJis[value])) |
330 (((int)(convert.cjkToJis[value + 1])) << 8);
// Branch 2: 0xFF01..0xFF60, looked up in convert.extraToJis the same way.
// NOTE(review): the upper bound 0xFF60 is also the lower bound of the next
// branch, so 0xFF60 can never reach the half-width kana branch.
331 } else if (ch >= 0xFF01 && ch <= 0xFF60) {
332 if (IsShifted(bytes)) {
333 var offset = byteIndex++;
334 if (bytes != null) bytes [offset] = 0x0F;
335 SetShifted(bytes, false);
339 case ISO2022JPMode.JISX0208:
342 SwitchMode (bytes, ref byteIndex, ref byteCount, ref m, ISO2022JPMode.JISX0208);
346 // This range contains extra characters,
347 value = (ch - 0xFF01) * 2;
348 value = ((int)(convert.extraToJis[value])) |
349 (((int)(convert.extraToJis[value + 1])) << 8);
// Branch 3: half-width kana. With allow_shift_io the encoder emits 0x0E once
// and sets the shift flag; the other visible path switches to JISX0201 mode.
350 } else if (ch >= 0xFF60 && ch <= 0xFFA0) {
351 // disallowed half-width kana is
352 // already converted to full-width kana
353 // so here we don't have to consider it.
355 if (allow_shift_io) {
356 if (!IsShifted(bytes)) {
357 var offset = byteIndex++;
358 if (bytes != null) bytes [offset] = 0x0E;
359 SetShifted(bytes, true);
364 case ISO2022JPMode.JISX0201:
367 SwitchMode (bytes, ref byteIndex, ref byteCount, ref m, ISO2022JPMode.JISX0201);
// Branch 4: 7-bit characters. Cancels a pending shift and switches to ASCII.
372 } else if (ch < 128) {
373 if (IsShifted(bytes)) {
374 var offset = byteIndex++;
375 if (bytes != null) bytes [offset] = 0x0F;
376 SetShifted(bytes, false);
379 SwitchMode (bytes, ref byteIndex, ref byteCount, ref m, ISO2022JPMode.ASCII);
384 chars, ref i, ref charCount,
385 bytes, ref byteIndex, ref byteCount, this);
387 // skip non-convertible character
391 //Console.WriteLine ("{0:X04} : {1:x02} {2:x02}", v, (int) v / 94 + 33, v % 94 + 33);
// Values >= 0x100 are written as two bytes (value / 94 + 33, value % 94 + 33);
// the single-byte write below presumably handles the remaining values.
392 if (value >= 0x100) {
395 bytes [byteIndex++] = (byte) (value / 94 + 33);
396 bytes [byteIndex++] = (byte) (value % 94 + 33);
403 var offset = byteIndex++;
404 if (bytes != null) bytes [offset] = (byte) value;
409 // must end in ASCII mode
410 if (IsShifted(bytes)) {
411 var offset = byteIndex++;
412 if (bytes != null) bytes [offset] = 0x0F;
413 SetShifted(bytes, false);
416 if (m != ISO2022JPMode.ASCII)
417 SwitchMode (bytes, ref byteIndex, ref byteCount, ref m, ISO2022JPMode.ASCII);
419 return byteIndex - start;
// Array-based conversion loop; the visible lines follow the same branch
// structure as GetBytesImpl above.
422 internal int GetBytesInternal(char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex, bool flush)
424 int start = byteIndex;
425 int end = charIndex + charCount;
// NOTE(review): byteCount is the full array length, not the space remaining
// after byteIndex - confirm how SwitchMode uses it for its capacity check.
427 int byteCount = bytes != null ? bytes.Length : 0;
429 for (int i = charIndex; i < end; i++, charCount--)
433 // When half-kana is not allowed and it is
434 // actually in the input, convert to full width
436 if (!allow_1byte_kana &&
437 ch >= 0xFF60 && ch <= 0xFFA0)
438 ch = full_width_map[ch - 0xFF60];
440 if (ch >= 0x2010 && ch <= 0x9FA5)
442 if (IsShifted (bytes))
444 var offset = byteIndex++;
445 if (bytes != null) bytes[offset] = 0x0F;
446 SetShifted (bytes, false);
451 case ISO2022JPMode.JISX0208:
454 SwitchMode(bytes, ref byteIndex, ref byteCount, ref m, ISO2022JPMode.JISX0208);
457 // This range contains the bulk of the CJK set.
458 value = (ch - 0x2010) * 2;
459 value = ((int)(convert.cjkToJis[value])) |
460 (((int)(convert.cjkToJis[value + 1])) << 8);
462 else if (ch >= 0xFF01 && ch <= 0xFF60)
464 if (IsShifted(bytes))
466 var offset = byteIndex++;
467 if (bytes != null) bytes[offset] = 0x0F;
468 SetShifted (bytes, false);
473 case ISO2022JPMode.JISX0208:
476 SwitchMode(bytes, ref byteIndex, ref byteCount, ref m, ISO2022JPMode.JISX0208);
480 // This range contains extra characters,
481 value = (ch - 0xFF01) * 2;
482 value = ((int)(convert.extraToJis[value])) |
483 (((int)(convert.extraToJis[value + 1])) << 8);
485 else if (ch >= 0xFF60 && ch <= 0xFFA0)
487 // disallowed half-width kana is
488 // already converted to full-width kana
489 // so here we don't have to consider it.
493 if (!IsShifted (bytes))
495 var offset = byteIndex++;
496 if (bytes != null) bytes[offset] = 0x0E;
497 SetShifted (bytes, true);
505 case ISO2022JPMode.JISX0201:
508 SwitchMode(bytes, ref byteIndex, ref byteCount, ref m, ISO2022JPMode.JISX0201);
516 if (IsShifted (bytes))
518 var offset = byteIndex++;
519 if (bytes != null) bytes[offset] = 0x0F;
520 SetShifted (bytes, false);
523 SwitchMode(bytes, ref byteIndex, ref byteCount, ref m, ISO2022JPMode.ASCII);
529 HandleFallback (chars, ref i, ref charCount,
530 bytes, ref byteIndex, ref byteCount, this);
532 // skip non-convertible character
536 //Console.WriteLine ("{0:X04} : {1:x02} {2:x02}", v, (int) v / 94 + 33, v % 94 + 33);
542 bytes[byteIndex++] = (byte)(value / 94 + 33);
543 bytes[byteIndex++] = (byte)(value % 94 + 33);
553 var offset = byteIndex++;
554 if (bytes != null) bytes[offset] = (byte)value;
560 // must end in ASCII mode
561 if (IsShifted (bytes))
563 var offset = byteIndex++;
564 if (bytes != null) bytes[offset] = 0x0F;
565 SetShifted (bytes, false);
568 if (m != ISO2022JPMode.ASCII)
569 SwitchMode(bytes, ref byteIndex, ref byteCount, ref m, ISO2022JPMode.ASCII);
572 return byteIndex - start;
// Public array-based entry point; forwards unchanged to GetBytesInternal.
575 public override int GetBytes(char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex, bool flush)
577 return GetBytesInternal (chars, charIndex, charCount, bytes, byteIndex, flush);
// Restores the initial state: ASCII mode and both shift flags cleared.
582 public override void Reset ()
584 m = ISO2022JPMode.ASCII;
585 shifted_in_conv = shifted_in_count = false;
// Stateful byte -> char converter. It tracks the current mode (`m`) and a
// shift flag, keeping separate shift flags for the counting pass
// (GetCharCount) and the conversion pass (GetChars).
591 internal class ISO2022JPDecoder : Decoder
593 static JISConvert convert = JISConvert.Convert;
595 readonly bool allow_shift_io;
596 ISO2022JPMode m = ISO2022JPMode.ASCII;
597 bool shifted_in_conv, shifted_in_count;
// NOTE(review): only allowShiftIO is stored in the visible lines;
// allow1ByteKana appears to be unused by the decoder - confirm.
599 public ISO2022JPDecoder (bool allow1ByteKana, bool allowShiftIO)
601 this.allow_shift_io = allowShiftIO;
// Counts the chars that the given byte range would decode to. It walks the
// input, updating the shift flag and mode as it meets shift bytes and escape
// sequences, and stops early on an incomplete wide character or escape
// sequence.
605 public override int GetCharCount (byte [] bytes, int index, int count)
609 int end = index + count;
610 for (int i = index; i < end; i++) {
// Shift handling applies only when allow_shift_io is set; the byte tests that
// clear or set the flag are not visible here (presumably 0x0F / 0x0E, as
// written by the encoder - confirm).
611 if (allow_shift_io) {
614 shifted_in_count = false;
617 shifted_in_count = true;
// 0x1B (ESC) starts an escape sequence; every other byte is data.
621 if (bytes [i] != 0x1B) {
622 if (!shifted_in_count && m == ISO2022JPMode.JISX0208) {
624 break; // incomplete head of wide char
630 ret++; // half-kana or ASCII
633 break; // incomplete escape sequence
// Second byte of the escape sequence: 0x24 ('$') or 0x28 ('(').
636 if (bytes [i] == 0x24)
638 else if (bytes [i] == 0x28)
// Third byte selects the mode: 0x42 -> JISX0208 when `wide` is set, otherwise
// ASCII; 0x4A -> ASCII; 0x49 -> JISX0201. (`wide` is assigned on lines not
// visible here.)
645 if (bytes [i] == 0x42)
646 m = wide ? ISO2022JPMode.JISX0208 : ISO2022JPMode.ASCII;
647 else if (bytes [i] == 0x4A) // obsoleted
648 m = ISO2022JPMode.ASCII;
649 else if (bytes [i] == 0x49)
650 m = ISO2022JPMode.JISX0201;
// Looks up a character in convert.jisx0208ToUnicode: two table bytes at
// `value` and `value + 1`, low byte first. Indices outside the table take the
// other arm of the conditional, whose result is not visible here.
658 private int ToChar (int value)
661 return value + 1 >= convert.jisx0208ToUnicode.Length || value < 0 ?
663 ((int) (convert.jisx0208ToUnicode [value])) |
664 (((int) (convert.jisx0208ToUnicode [value + 1])) << 8);
// Decodes the byte range into `chars` starting at charIndex and returns the
// number of chars written (charIndex - start).
667 public override int GetChars (byte [] bytes, int byteIndex, int byteCount, char [] chars, int charIndex)
669 int start = charIndex;
670 int end = byteIndex + byteCount;
// NOTE(review): the loop guard only guarantees one free slot in `chars` per
// iteration, while some branches below write two or three chars - confirm
// there is a capacity check on the lines not shown.
671 for (int i = byteIndex; i < end && charIndex < chars.Length; i++) {
672 if (allow_shift_io) {
675 shifted_in_conv = false;
678 shifted_in_conv = true;
683 if (bytes [i] != 0x1B) {
// Shifted or JISX0201 mode: a byte below 0x60 becomes the char at
// (byte + 0xFF40); the other visible outcome is '?'.
684 if (shifted_in_conv || m == ISO2022JPMode.JISX0201) {
686 if (bytes [i] < 0x60)
687 chars [charIndex++] = (char) (bytes [i] + 0xFF40);
690 chars [charIndex++] = '?';
// JISX0208 mode: consumes bytes[i] and bytes[i + 1] as one wide character.
692 else if (m == ISO2022JPMode.JISX0208) {
694 break; // incomplete head of wide char
696 // am so lazy, so reusing jis2sjis
697 int s1 = ((bytes [i] - 1) >> 1) + ((bytes [i] <= 0x5e) ? 0x71 : 0xb1);
698 int s2 = bytes [i + 1] + (((bytes [i] & 1) != 0) ? 0x20 : 0x7e);
699 int v = (s1 - 0x81) * 0xBC;
704 chars [charIndex++] = '?';
706 chars [charIndex++] = (char) ch;
709 // LAMESPEC: actually this should not
710 // be allowed when 1byte-kana is not
711 // allowed, but MS.NET seems to allow
// Bytes strictly between 0xA0 and 0xE0 map to (byte - 0xA0 + 0xFF60).
713 else if (bytes [i] > 0xA0 && bytes [i] < 0xE0) // half-width Katakana
714 chars [charIndex++] = (char) (bytes [i] - 0xA0 + 0xFF60);
716 chars [charIndex++] = (char) bytes [i];
720 break; // incomplete escape sequence
723 if (bytes [i] == 0x24)
725 else if (bytes [i] == 0x28)
// The consumed bytes are written back as literal chars (ESC + byte);
// presumably this is the fallback for an unrecognised second byte - confirm.
728 chars [charIndex++] = '\x1B';
729 chars [charIndex++] = (char) bytes [i];
// Third byte selects the mode, as in GetCharCount.
733 if (bytes [i] == 0x42)
734 m = wide ? ISO2022JPMode.JISX0208 : ISO2022JPMode.ASCII;
735 else if (bytes [i] == 0x4A) // obsoleted
736 m = ISO2022JPMode.ASCII;
737 else if (bytes [i] == 0x49)
738 m = ISO2022JPMode.JISX0201;
// All three consumed bytes are written back as literal chars; presumably the
// fallback for an unrecognised third byte - confirm.
740 chars [charIndex++] = '\x1B';
741 chars [charIndex++] = (char) bytes [i - 1];
742 chars [charIndex++] = (char) bytes [i];
747 return charIndex - start;
// Restores the initial state: ASCII mode and both shift flags cleared.
751 public override void Reset ()
753 m = ISO2022JPMode.ASCII;
754 shifted_in_count = shifted_in_conv = false;
// Derives from CP50220 and adds only a parameterless constructor that chains
// to the base constructor. Presumably exists so the encoding can be located
// by its name - confirm against the code that instantiates ENC* classes.
760 public class ENCiso_2022_jp : CP50220
762 public ENCiso_2022_jp () : base() {}
764 }; // class ENCiso_2022_jp