using System;
using System.Text;
using I18N.Common;

namespace I18N.CJK
{
	// FIXME:
	// find out what is the difference between 50220, 50221 and 50222.
	// (In this file the three code pages differ only in the two flags
	// passed to ISO2022JPEncoding: allow1ByteKana and allowShiftIO.)

	/// <summary>Code page 50220: half-width kana are converted to full-width, no SO/SI.</summary>
	public class CP50220 : ISO2022JPEncoding
	{
		public CP50220 ()
			: base (50220, false, false)
		{
		}

		public override string EncodingName {
			get { return "Japanese (JIS)"; }
		}
	}

	/// <summary>Code page 50221: half-width kana are emitted via the ESC ( I escape sequence.</summary>
	public class CP50221 : ISO2022JPEncoding
	{
		public CP50221 ()
			: base (50221, true, false)
		{
		}

		public override string EncodingName {
			get { return "Japanese (JIS-Allow 1 byte Kana)"; }
		}
	}

	/// <summary>Code page 50222: half-width kana are emitted between 0x0E / 0x0F shift bytes.</summary>
	public class CP50222 : ISO2022JPEncoding
	{
		public CP50222 ()
			: base (50222, true, true)
		{
		}

		public override string EncodingName {
			get { return "Japanese (JIS-Allow 1 byte Kana - SO/SI)"; }
		}
	}

	/// <summary>
	/// Common implementation of the ISO-2022-JP family. Every call creates a
	/// fresh encoder/decoder, so the methods of this class are stateless.
	/// </summary>
	public class ISO2022JPEncoding : MonoEncoding
	{
		static JISConvert convert = JISConvert.Convert;

		public ISO2022JPEncoding (int codePage, bool allow1ByteKana, bool allowShiftIO)
			: base (codePage)
		{
			this.allow_1byte_kana = allow1ByteKana;
			this.allow_shift_io = allowShiftIO;
		}

		readonly bool allow_1byte_kana, allow_shift_io;

		public override string BodyName {
			get { return "iso-2022-jp"; }
		}

		public override string HeaderName {
			get { return "iso-2022-jp"; }
		}

		public override string WebName {
			get { return "csISO2022JP"; }
		}

		/// <summary>
		/// Returns an upper bound for the number of bytes produced by
		/// encoding <paramref name="charCount"/> characters.
		/// </summary>
		/// <exception cref="ArgumentOutOfRangeException">
		/// <paramref name="charCount"/> is negative or the result does not fit in an Int32.
		/// </exception>
		public override int GetMaxByteCount (int charCount)
		{
			if (charCount < 0)
				throw new ArgumentOutOfRangeException ("charCount");

			// Worst case per character as emitted by ISO2022JPEncoder:
			//   3 (escape sequence) + 2 (double-byte character) = 5,
			//   plus 1 when a shift byte (0x0F) must be written first,
			//   which can only happen when SO/SI is enabled.
			// The final flush adds at most 1 (shift byte) + 3 (ESC ( B).
			long perChar = allow_shift_io ? 6 : 5;
			long max = charCount * perChar + 4;
			if (max > Int32.MaxValue)
				throw new ArgumentOutOfRangeException ("charCount");
			return (int) max;
		}

		public override int GetMaxCharCount (int byteCount)
		{
			// no escape sequence: every byte yields at most one character
			return byteCount;
		}

		public override int GetByteCount (char [] chars, int charIndex, int charCount)
		{
			return new ISO2022JPEncoder (this, allow_1byte_kana, allow_shift_io).GetByteCount (chars, charIndex, charCount, true);
		}

		public unsafe override int GetBytesImpl (char* chars, int charCount, byte* bytes, int byteCount)
		{
			return new ISO2022JPEncoder (this, allow_1byte_kana, allow_shift_io).GetBytesImpl (chars, charCount, bytes, byteCount, true);
		}

		public override int GetCharCount (byte [] bytes, int index, int count)
		{
			return new ISO2022JPDecoder (allow_1byte_kana, allow_shift_io).GetCharCount (bytes, index, count);
		}

		public override int GetChars (byte [] bytes, int byteIndex, int byteCount, char [] chars, int charIndex)
		{
			return new ISO2022JPDecoder (allow_1byte_kana, allow_shift_io).GetChars (bytes, byteIndex, byteCount, chars, charIndex);
		}
	}

	/// <summary>Character set currently designated by the last escape sequence.</summary>
	internal enum ISO2022JPMode {
		ASCII,
		JISX0208,
		JISX0201
	}

	/// <summary>
	/// Stateful ISO-2022-JP encoder. The state is the designated character
	/// set (<c>m</c>) and whether a 0x0E shift byte is pending a matching 0x0F
	/// (<c>shifted_in</c>).
	/// </summary>
	internal class ISO2022JPEncoder : MonoEncoding.MonoEncoder
	{
		static JISConvert convert = JISConvert.Convert;

		readonly bool allow_1byte_kana, allow_shift_io;

		ISO2022JPMode m = ISO2022JPMode.ASCII;
		bool shifted_in;

		public ISO2022JPEncoder (MonoEncoding owner, bool allow1ByteKana, bool allowShiftIO)
			: base (owner)
		{
			this.allow_1byte_kana = allow1ByteKana;
			this.allow_shift_io = allowShiftIO;
		}

		/// <summary>
		/// Counts the bytes that GetBytesImpl would write for the given
		/// characters, starting from the encoder's current state.
		/// The encoder state itself is left untouched.
		/// </summary>
		/// <param name="flush">
		/// When true, the bytes needed to return to unshifted ASCII mode are included.
		/// </param>
		public override int GetByteCount (char [] chars, int charIndex, int charCount, bool flush)
		{
			// Simulate on copies so that counting never changes what a
			// following GetBytes call on this encoder emits.
			ISO2022JPMode mode = m;
			bool shifted = shifted_in;

			int end = charIndex + charCount;
			int value;
			int byteCount = 0;

			for (int i = charIndex; i < end; i++) {
				char ch = chars [i];
				// When half-kana is not allowed and it is
				// actually in the input, convert to full width
				// kana.
				if (!allow_1byte_kana &&
					ch >= 0xFF60 && ch <= 0xFFA0)
					ch = full_width_map [ch - 0xFF60];

				if (ch >= 0x2010 && ch <= 0x9FA5) {
					if (shifted) {
						shifted = false;
						byteCount++; // 0x0F
					}
					if (mode != ISO2022JPMode.JISX0208)
						byteCount += 3;
					mode = ISO2022JPMode.JISX0208;
					// This range contains the bulk of the CJK set.
					value = (ch - 0x2010) * 2;
					value = ((int) (convert.cjkToJis [value])) |
						(((int) (convert.cjkToJis [value + 1])) << 8);
				} else if (ch >= 0xFF01 && ch <= 0xFF60) {
					if (shifted) {
						shifted = false;
						byteCount++; // 0x0F
					}
					if (mode != ISO2022JPMode.JISX0208)
						byteCount += 3;
					mode = ISO2022JPMode.JISX0208;
					// This range contains extra characters.
					value = (ch - 0xFF01) * 2;
					value = ((int) (convert.extraToJis [value])) |
						(((int) (convert.extraToJis [value + 1])) << 8);
				} else if (ch >= 0xFF60 && ch <= 0xFFA0) {
					if (allow_shift_io) {
						if (!shifted) {
							byteCount++; // 0x0E
							shifted = true;
						}
					} else if (mode != ISO2022JPMode.JISX0201) {
						byteCount += 3;
						mode = ISO2022JPMode.JISX0201;
					}
					// Same single-byte value as written by GetBytesImpl.
					value = ch - 0xFF40;
				} else if (ch < 128) {
					if (shifted) {
						shifted = false;
						byteCount++; // 0x0F
					}
					if (mode != ISO2022JPMode.ASCII)
						byteCount += 3;
					mode = ISO2022JPMode.ASCII;
					value = (int) ch;
				} else {
					// skip non-convertible character
					continue;
				}

				if (value > 0x100)
					byteCount += 2;
				else
					byteCount++;
			}

			// must end in ASCII mode
			if (flush) {
				if (shifted)
					byteCount++; // 0x0F
				if (mode != ISO2022JPMode.ASCII)
					byteCount += 3;
			}
			return byteCount;
		}

		// Throws when fewer than `needed` bytes remain in the output buffer.
		private static void EnsureSpace (int available, int needed)
		{
			if (available < needed)
				throw new ArgumentOutOfRangeException ("byteCount", "Insufficient byte buffer.");
		}

		// Writes the 3-byte escape sequence that designates `next`
		// (ESC $ B, ESC ( I or ESC ( B) unless it is already current.
		// Throws ArgumentOutOfRangeException if the escape does not fit.
		private unsafe void SwitchMode (byte* bytes, ref int byteIndex,
			ref int byteCount, ref ISO2022JPMode cur, ISO2022JPMode next)
		{
			if (cur == next)
				return;
			EnsureSpace (byteCount, 3);
			bytes [byteIndex++] = 0x1B;
			bytes [byteIndex++] = (byte) (next == ISO2022JPMode.JISX0208 ? 0x24 : 0x28);
			bytes [byteIndex++] = (byte) (next == ISO2022JPMode.JISX0201 ? 0x49 : 0x42);
			byteCount -= 3;
			cur = next;
		}

		// Writes 0x0F to leave the shifted kana state, if it is active.
		private unsafe void LeaveKanaShift (byte* bytes, ref int byteIndex, ref int byteCount)
		{
			if (!shifted_in)
				return;
			EnsureSpace (byteCount, 1);
			bytes [byteIndex++] = 0x0F;
			shifted_in = false;
			byteCount--;
		}

		// Writes 0x0E to enter the shifted kana state, if it is not active yet.
		private unsafe void EnterKanaShift (byte* bytes, ref int byteIndex, ref int byteCount)
		{
			if (shifted_in)
				return;
			EnsureSpace (byteCount, 1);
			bytes [byteIndex++] = 0x0E;
			shifted_in = true;
			byteCount--;
		}

		// Indexed by (ch - 0xFF60): full-width replacement for each
		// code point in U+FF60..U+FFA0.
		static readonly char [] full_width_map = new char [] {
			'\0', '\u3002', '\u300C', '\u300D', '\u3001', '\u30FB', // to nakaguro
			'\u30F2', '\u30A1', '\u30A3', '\u30A5', '\u30A7', '\u30A9', '\u30E3', '\u30E5', '\u30E7', '\u30C3', // to small tsu
			'\u30FC', '\u30A2', '\u30A4', '\u30A6', '\u30A8', '\u30AA', // A-O
			'\u30AB', '\u30AD', '\u30AF', '\u30B1', '\u30B3', // Ka-Ko
			'\u30B5', '\u30B7', '\u30B9', '\u30BB', '\u30BD', // Sa-So
			'\u30BF', '\u30C1', '\u30C4', '\u30C6', '\u30C8', // Ta-To
			'\u30CA', '\u30CB', '\u30CC', '\u30CD', '\u30CE', // Na-No
			'\u30CF', '\u30D2', '\u30D5', '\u30D8', '\u30DB', // Ha-Ho
			'\u30DE', '\u30DF', '\u30E0', '\u30E1', '\u30E2', // Ma-Mo
			'\u30E4', '\u30E6', '\u30E8', // Ya-Yo
			'\u30E9', '\u30EA', '\u30EB', '\u30EC', '\u30ED', // Ra-Ro
			'\u30EF', '\u30F3', '\u309B', '\u309C', // Wa, N, voiced and semi-voiced marks
			// NOTE(review): U+FFA0 is not a kana; this entry is kept as it
			// was so that behaviour for that code point does not change.
			'\u309C'
		};

		/// <summary>
		/// Encodes <paramref name="charCount"/> characters into
		/// <paramref name="bytes"/> and returns the number of bytes written.
		/// </summary>
		/// <param name="byteCount">Space available in <paramref name="bytes"/>.</param>
		/// <param name="flush">
		/// When true, the output is terminated so that it ends unshifted in ASCII mode.
		/// </param>
		/// <exception cref="ArgumentOutOfRangeException">
		/// The output does not fit into <paramref name="byteCount"/> bytes.
		/// </exception>
		public unsafe override int GetBytesImpl (
			char* chars, int charCount,
			byte* bytes, int byteCount, bool flush)
		{
			int charIndex = 0;
			int byteIndex = 0;

			int start = byteIndex;
			int end = charIndex + charCount;
			int value;

			for (int i = charIndex; i < end; i++, charCount--) {
				char ch = chars [i];

				// When half-kana is not allowed and it is
				// actually in the input, convert to full width
				// kana.
				if (!allow_1byte_kana &&
					ch >= 0xFF60 && ch <= 0xFFA0)
					ch = full_width_map [ch - 0xFF60];

				if (ch >= 0x2010 && ch <= 0x9FA5) {
					LeaveKanaShift (bytes, ref byteIndex, ref byteCount);
					SwitchMode (bytes, ref byteIndex, ref byteCount, ref m, ISO2022JPMode.JISX0208);
					// This range contains the bulk of the CJK set.
					value = (ch - 0x2010) * 2;
					value = ((int) (convert.cjkToJis [value])) |
						(((int) (convert.cjkToJis [value + 1])) << 8);
				} else if (ch >= 0xFF01 && ch <= 0xFF60) {
					LeaveKanaShift (bytes, ref byteIndex, ref byteCount);
					SwitchMode (bytes, ref byteIndex, ref byteCount, ref m, ISO2022JPMode.JISX0208);
					// This range contains extra characters.
					value = (ch - 0xFF01) * 2;
					value = ((int) (convert.extraToJis [value])) |
						(((int) (convert.extraToJis [value + 1])) << 8);
				} else if (ch >= 0xFF60 && ch <= 0xFFA0) {
					// disallowed half-width kana is
					// already converted to full-width kana
					// so here we don't have to consider it.
					if (allow_shift_io)
						EnterKanaShift (bytes, ref byteIndex, ref byteCount);
					else
						SwitchMode (bytes, ref byteIndex, ref byteCount, ref m, ISO2022JPMode.JISX0201);
					value = ch - 0xFF40;
				} else if (ch < 128) {
					LeaveKanaShift (bytes, ref byteIndex, ref byteCount);
					SwitchMode (bytes, ref byteIndex, ref byteCount, ref m, ISO2022JPMode.ASCII);
					value = (int) ch;
				} else {
#if NET_2_0
					HandleFallback (
						chars, ref i, ref charCount,
						bytes, ref byteIndex, ref byteCount);
#endif
					// skip non-convertible character
					continue;
				}

				// NOTE(review): a table value of 0 (presumably "no mapping")
				// and the value 0x100 both take the single-byte path below;
				// confirm against JISConvert before changing.
				if (value > 0x100) {
					EnsureSpace (byteCount, 2);
					value -= 0x0100;
					bytes [byteIndex++] = (byte) (value / 94 + 33);
					bytes [byteIndex++] = (byte) (value % 94 + 33);
					byteCount -= 2;
				} else {
					EnsureSpace (byteCount, 1);
					bytes [byteIndex++] = (byte) value;
					byteCount--;
				}
			}

			if (flush) {
				// must end in ASCII mode
				LeaveKanaShift (bytes, ref byteIndex, ref byteCount);
				SwitchMode (bytes, ref byteIndex, ref byteCount, ref m, ISO2022JPMode.ASCII);
			}
			return byteIndex - start;
		}

#if NET_2_0
		public override void Reset ()
		{
			m = ISO2022JPMode.ASCII;
			shifted_in = false;
		}
#endif
	}

	/// <summary>
	/// Stateful ISO-2022-JP decoder. The state is the designated character
	/// set (<c>m</c>) and whether a 0x0E shift byte has been seen without a
	/// matching 0x0F (<c>shifted_in</c>).
	/// </summary>
	internal class ISO2022JPDecoder : Decoder
	{
		static JISConvert convert = JISConvert.Convert;

		readonly bool allow_shift_io;
		ISO2022JPMode m = ISO2022JPMode.ASCII;
		bool shifted_in;

		// allow1ByteKana is accepted for symmetry with the encoder but is
		// not used: 1-byte kana is decoded in every mode (see LAMESPEC below).
		public ISO2022JPDecoder (bool allow1ByteKana, bool allowShiftIO)
		{
			this.allow_shift_io = allowShiftIO;
		}

		/// <summary>
		/// Counts the characters that decoding the given bytes would
		/// produce, starting from the decoder's current state. The decoder
		/// state itself is left untouched.
		/// </summary>
		/// <exception cref="ArgumentException">An unknown escape sequence was found.</exception>
		public override int GetCharCount (byte [] bytes, int index, int count)
		{
			// Simulate on copies so that counting never changes how a
			// following GetChars call on this decoder interprets its input.
			ISO2022JPMode mode = m;
			bool shifted = shifted_in;

			int ret = 0;
			int end = index + count;

			for (int i = index; i < end; i++) {
				if (allow_shift_io) {
					switch (bytes [i]) {
					case 0x0F:
						shifted = false;
						continue;
					case 0x0E:
						shifted = true;
						continue;
					}
				}

				if (bytes [i] != 0x1B) {
					if (!shifted && mode == ISO2022JPMode.JISX0208) {
						if (i + 1 == end)
							break; // incomplete head of wide char
						ret++;
						i++; // 2 byte char
					} else {
						ret++; // half-kana or ASCII
					}
					continue;
				}

				if (i + 2 >= end)
					break; // incomplete escape sequence
				i++;
				bool wide;
				if (bytes [i] == 0x24)
					wide = true;
				else if (bytes [i] == 0x28)
					wide = false;
				else
					throw new ArgumentException ("Unexpected ISO-2022-JP escape sequence.");
				i++;
				if (bytes [i] == 0x42)
					mode = wide ? ISO2022JPMode.JISX0208 : ISO2022JPMode.ASCII;
				else if (bytes [i] == 0x49)
					mode = ISO2022JPMode.JISX0201;
				else
					throw new ArgumentException (String.Format ("Unexpected ISO-2022-JP escape sequence. Ended with 0x{0:X04}", bytes [i]));
			}
			return ret;
		}

		// Looks up a table index in jisx0208ToUnicode (two bytes per entry,
		// low byte first). Returns -1 when the index is outside the table,
		// including negative indexes produced by malformed lead bytes.
		private int ToChar (int value)
		{
			if (value < 0)
				return -1;
			value <<= 1;
			return value + 1 >= convert.jisx0208ToUnicode.Length ? -1 :
				((int) (convert.jisx0208ToUnicode [value])) |
				(((int) (convert.jisx0208ToUnicode [value + 1])) << 8);
		}

		/// <summary>
		/// Decodes bytes into <paramref name="chars"/> and returns the
		/// number of characters written. Decoding stops early when
		/// <paramref name="chars"/> is full or the input ends inside a
		/// double-byte character or an escape sequence.
		/// </summary>
		/// <exception cref="ArgumentException">An unknown escape sequence was found.</exception>
		public override int GetChars (byte [] bytes, int byteIndex, int byteCount, char [] chars, int charIndex)
		{
			int start = charIndex;
			int end = byteIndex + byteCount;

			for (int i = byteIndex; i < end && charIndex < chars.Length; i++) {
				if (allow_shift_io) {
					switch (bytes [i]) {
					case 0x0F:
						shifted_in = false;
						continue;
					case 0x0E:
						shifted_in = true;
						continue;
					}
				}

				if (bytes [i] != 0x1B) {
					if (shifted_in || m == ISO2022JPMode.JISX0201) {
						// half-kana
						if (bytes [i] < 0x60)
							chars [charIndex++] = (char) (bytes [i] + 0xFF40);
						else
							// invalid
							chars [charIndex++] = '?';
					} else if (m == ISO2022JPMode.JISX0208) {
						if (i + 1 == end)
							break; // incomplete head of wide char

						// am so lazy, so reusing jis2sjis
						int s1 = ((bytes [i] - 1) >> 1) + ((bytes [i] <= 0x5e) ? 0x71 : 0xb1);
						int s2 = bytes [i + 1] + (((bytes [i] & 1) != 0) ? 0x20 : 0x7e);
						int v = (s1 - 0x81) * 0xBC;
						v += s2 - 0x41;

						int ch = ToChar (v);
						if (ch < 0)
							chars [charIndex++] = '?';
						else
							chars [charIndex++] = (char) ch;
						i++;
					}
					// LAMESPEC: actually this should not
					// be allowed when 1byte-kana is not
					// allowed, but MS.NET seems to allow
					// it in any mode.
					else if (bytes [i] > 0xA0 && bytes [i] < 0xE0) // half-width Katakana
						chars [charIndex++] = (char) (bytes [i] - 0xA0 + 0xFF60);
					else
						chars [charIndex++] = (char) bytes [i];
					continue;
				}

				if (i + 2 >= end)
					break; // incomplete escape sequence
				i++;
				bool wide;
				if (bytes [i] == 0x24)
					wide = true;
				else if (bytes [i] == 0x28)
					wide = false;
				else
					throw new ArgumentException ("Unexpected ISO-2022-JP escape sequence.");
				i++;
				if (bytes [i] == 0x42)
					m = wide ? ISO2022JPMode.JISX0208 : ISO2022JPMode.ASCII;
				else if (bytes [i] == 0x49)
					m = ISO2022JPMode.JISX0201;
				else
					throw new ArgumentException (String.Format ("Unexpected ISO-2022-JP escape sequence. Ended with 0x{0:X04}", bytes [i]));
			}

			return charIndex - start;
		}

#if NET_2_0
		public override void Reset ()
		{
			m = ISO2022JPMode.ASCII;
			shifted_in = false;
		}
#endif
	}

	/// <summary>Name alias that resolves to code page 50220.</summary>
	public class ENCiso_2022_jp : CP50220
	{
		public ENCiso_2022_jp () : base () {}
	} // class ENCiso_2022_jp
}