/*
 * CP51932.cs - Japanese EUC-JP code page.
 *
 * It is based on CP932.cs from Portable.NET
 *
 * Author:
 *	Atsushi Enomoto
 *
 * Below are original (CP932.cs) copyright lines
 *
 * (C)2004 Novell Inc.
 *
 * Copyright (c) 2002 Southern Storm Software, Pty Ltd
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

/*
	There appears to be no source for jis.table. It seems to have been
	generated from text files on the Unicode home page, such as
	ftp://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0208.TXT
	However, that mapping is non-normative and, in Japan, is known to
	contain many problems.

	FIXME: Some characters such as 0xFF0B (wide "plus") are missing in
	that table.
*/

/*
	0x00-0x1F, 0x7F : control characters
	0x20-0x7E : ASCII
	0xA1A1-0xFEFE : Kanji (precisely, both bytes contain only A1-FE)
	0x8EA1-0x8EDF : half-width Katakana
	0x8FA1A1-0x8FFEFE : Complemental Kanji
*/

namespace I18N.CJK
{
	using System;
	using System.Text;
	using I18N.Common;

	// EUC-JP (code page 51932) encoding.
	//
	// The one-shot methods on this class create a fresh encoder/decoder
	// per call and always run it in "flush" mode, so no state is carried
	// from one call to the next.
	[Serializable]
	public class CP51932 : MonoEncoding
	{
		// Magic number used by Windows for the EUC-JP code page.
		private const int EUC_JP_CODE_PAGE = 51932;

		// Constructor.
		public CP51932 () : base (EUC_JP_CODE_PAGE, 932)
		{
		}

		// Get the number of bytes needed to encode a range of a
		// character array (delegates to a temporary, flushing encoder).
		public override int GetByteCount (char [] chars, int index, int length)
		{
			return new CP51932Encoder (this).GetByteCount (chars, index, length, true);
		}

		// Pointer-based variant of GetByteCount.
		public unsafe override int GetByteCountImpl (char* chars, int count)
		{
			return new CP51932Encoder (this).GetByteCountImpl (chars, count, true);
		}

		// Encode charCount characters from chars into bytes; returns
		// the number of bytes written.
		public unsafe override int GetBytesImpl (char* chars, int charCount, byte* bytes, int byteCount)
		{
			return new CP51932Encoder (this).GetBytesImpl (chars, charCount, bytes, byteCount, true);
		}

		// Get the number of characters needed to decode a byte range.
		public override int GetCharCount (byte [] bytes, int index, int count)
		{
#if NET_2_0
			return new CP51932Decoder ().GetCharCount (
				bytes, index, count, true);
#else
			return new CP51932Decoder ().GetCharCount (
				bytes, index, count);
#endif
		}

		// Decode a byte range into chars starting at charIndex;
		// returns the number of characters written.
		public override int GetChars (
			byte [] bytes, int byteIndex, int byteCount,
			char [] chars, int charIndex)
		{
#if NET_2_0
			return new CP51932Decoder ().GetChars (bytes,
				byteIndex, byteCount, chars, charIndex, true);
#else
			return new CP51932Decoder ().GetChars (bytes,
				byteIndex, byteCount, chars, charIndex);
#endif
		}

		// Get the maximum number of bytes needed to encode a
		// specified number of characters.
		public override int GetMaxByteCount (int charCount)
		{
			if (charCount < 0) {
				throw new ArgumentOutOfRangeException
					("charCount",
					 Strings.GetString ("ArgRange_NonNegative"));
			}
			return charCount * 3;
		}

		// Get the maximum number of characters needed to decode a
		// specified number of bytes.
		public override int GetMaxCharCount (int byteCount)
		{
			if (byteCount < 0) {
				throw new ArgumentOutOfRangeException
					("byteCount",
					 Strings.GetString ("ArgRange_NonNegative"));
			}
			return byteCount;
		}

		// Get a stateful encoder for this code page.
		public override Encoder GetEncoder ()
		{
			return new CP51932Encoder (this);
		}

		// Get a stateful decoder for this code page.
		public override Decoder GetDecoder ()
		{
			return new CP51932Decoder ();
		}

#if !ECMA_COMPAT

		// Get the mail body name for this encoding.
		public override String BodyName {
			get { return "euc-jp"; }
		}

		// Get the human-readable name for this encoding.
		public override String EncodingName {
			get { return "Japanese (EUC)"; }
		}

		// Get the mail agent header name for this encoding.
		public override String HeaderName {
			get { return "euc-jp"; }
		}

		// Determine if this encoding can be displayed in a Web browser.
		public override bool IsBrowserDisplay {
			get { return true; }
		}

		// Determine if this encoding can be saved from a Web browser.
		public override bool IsBrowserSave {
			get { return true; }
		}

		// Determine if this encoding can be displayed in a mail/news agent.
		public override bool IsMailNewsDisplay {
			get { return true; }
		}

		// Determine if this encoding can be saved from a mail/news agent.
		public override bool IsMailNewsSave {
			get { return true; }
		}

		// Get the IANA-preferred Web name for this encoding.
		public override String WebName {
			get { return "euc-jp"; }
		}

#endif // !ECMA_COMPAT

		// The closing brace below must stay outside the ECMA_COMPAT
		// conditional, otherwise the class is left unterminated when
		// ECMA_COMPAT is defined.
	} // CP51932

	// Encoder for EUC-JP.
	//
	// The conversion tables store one little-endian 16-bit entry per
	// character: 0 means "unmapped", a value below 0x0100 is emitted as
	// a single byte, and a larger value is 0x0100 plus a linear index
	// that is split into two bytes (index / 94 + 0xA1, index % 94 + 0xA1).
	public class CP51932Encoder : MonoEncoder
	{
		public CP51932Encoder (MonoEncoding encoding)
			: base (encoding)
		{
		}

		// Get the number of bytes needed to encode a character buffer.
		// The "refresh" argument is not used by this implementation.
		//
		// NOTE(review): this estimate is not always equal to what
		// GetBytesImpl writes. It counts two bytes for the special
		// Latin-1 characters and for every character in the Greek
		// range, whereas GetBytesImpl has no Latin-1 mapping and may
		// find an unmapped Greek entry; neither path accounts for the
		// fallback. Confirm the intended behaviour before relying on
		// an exact count.
		public unsafe override int GetByteCountImpl (
			char* chars, int count, bool refresh)
		{
			int index = 0;
			int length = 0;
			int ch, value;
			byte [] cjkToJis = JISConvert.Convert.cjkToJis;
			byte [] extraToJis = JISConvert.Convert.extraToJis;

			while (count > 0) {
				ch = chars [index++];
				--count;

				// Every character produces at least one byte.
				++length;

				if (ch < 0x0080) {
					// Character maps to itself.
					continue;
				} else if (ch < 0x0100) {
					// Check for special Latin 1 characters that
					// can be mapped to double-byte code points.
					switch (ch) {
					case 0x00A2:
					case 0x00A3:
					case 0x00A7:
					case 0x00A8:
					case 0x00AC:
					case 0x00B0:
					case 0x00B1:
					case 0x00B4:
					case 0x00B6:
					case 0x00D7:
					case 0x00F7:
						++length;
						break;
					}
				} else if (ch >= 0x0391 && ch <= 0x0451) {
					// Greek subset characters.
					++length;
				} else if (ch >= 0x2010 && ch <= 0x9FA5) {
					// This range contains the bulk of the CJK set.
					value = (ch - 0x2010) * 2;
					value = ((int) (cjkToJis [value])) |
						(((int) (cjkToJis [value + 1])) << 8);
					if (value >= 0x0100)
						++length;
				} else if (ch >= 0xFF01 && ch < 0xFF60) {
					// This range contains extra characters.
					value = (ch - 0xFF01) * 2;
					value = ((int) (extraToJis [value])) |
						(((int) (extraToJis [value + 1])) << 8);
					if (value >= 0x0100)
						++length;
				} else if (ch >= 0xFF60 && ch <= 0xFFA0) {
					// Half-width kana: 0x8E prefix plus one byte.
					++length;
				}
			}

			// Return the length to the caller.
			return length;
		}

		// Get the bytes that result from encoding a character buffer.
		// Returns the number of bytes written. Throws ArgumentException
		// when the output buffer is too small. The "refresh" argument
		// is not used by this implementation.
		public unsafe override int GetBytesImpl (
			char* chars, int charCount, byte* bytes, int byteCount, bool refresh)
		{
			int charIndex = 0;
			int posn = 0;

			// Snapshot of the output capacity; byteCount itself is
			// handed by reference to HandleFallback below.
			int byteLength = byteCount;
			int ch, value;
			byte [] cjkToJis = JISConvert.Convert.cjkToJis;
			byte [] greekToJis = JISConvert.Convert.greekToJis;
			byte [] extraToJis = JISConvert.Convert.extraToJis;

			for (; charCount > 0; charIndex++, --charCount) {
				ch = chars [charIndex];
				if (posn >= byteLength) {
					throw new ArgumentException
						(Strings.GetString ("Arg_InsufficientSpace"),
						 "bytes");
				}

				if (ch < 0x0080) {
					// Character maps to itself.
					bytes [posn++] = (byte) ch;
					continue;
				} else if (ch >= 0x0391 && ch <= 0x0451) {
					// Greek subset characters.
					value = (ch - 0x0391) * 2;
					value = ((int) (greekToJis [value])) |
						(((int) (greekToJis [value + 1])) << 8);
				} else if (ch >= 0x2010 && ch <= 0x9FA5) {
					// This range contains the bulk of the CJK set.
					value = (ch - 0x2010) * 2;
					value = ((int) (cjkToJis [value])) |
						(((int) (cjkToJis [value + 1])) << 8);
				} else if (ch >= 0xFF01 && ch <= 0xFF60) {
					// This range contains extra (full-width)
					// characters, looked up in extraToJis.
					value = (ch - 0xFF01) * 2;
					value = ((int) (extraToJis [value])) |
						(((int) (extraToJis [value + 1])) << 8);
				} else if (ch >= 0xFF60 && ch <= 0xFFA0) {
					// Half-width kana: produces 0x8Exx, which is
					// emitted below as the two bytes 0x8E, xx.
					value = ch - 0xFF60 + 0x8EA0;
				} else {
					// Invalid character.
					value = 0;
				}

				if (value == 0) {
#if NET_2_0
					HandleFallback (
						chars, ref charIndex, ref charCount,
						bytes, ref posn, ref byteCount);
#else
					bytes [posn++] = (byte) '?';
#endif
				} else if (value < 0x0100) {
					bytes [posn++] = (byte) value;
				} else if ((posn + 1) >= byteLength) {
					// Two bytes are needed from here on.
					throw new ArgumentException
						(Strings.GetString ("Arg_InsufficientSpace"),
						 "bytes");
				} else if (value < 0x8000) {
					// general 2byte glyph/kanji
					value -= 0x0100;
					bytes [posn++] = (byte) (value / 0x5E + 0xA1);
					bytes [posn++] = (byte) (value % 0x5E + 0xA1);
					continue;
				} else {
					// half-width kana
					bytes [posn++] = 0x8E;
					bytes [posn++] = (byte) (value - 0x8E00);
				}
			}

			// Return the final length to the caller.
			return posn;
		}
	} // CP51932Encoder

	// Decoder for EUC-JP.
	//
	// A pending lead byte (0x8E or 0xA1-0xFE) is remembered between
	// calls. GetCharCount and GetChars keep separate pending-byte
	// fields (last_count and last_bytes respectively). Invalid or
	// incomplete sequences decode to U+30FB (KATAKANA MIDDLE DOT).
	//
	// Three-byte sequences introduced by 0x8F are not supported: the
	// 0x8F byte itself is treated as an invalid lead byte and the bytes
	// that follow are decoded on their own.
	internal class CP51932Decoder : DbcsEncoding.DbcsDecoder
	{
		// Character produced for invalid or truncated input.
		const char InvalidChar = '\u30FB';

		public CP51932Decoder ()
			: base (null)
		{
		}

		// Pending lead byte (0 when none) for GetCharCount and
		// GetChars respectively.
		int last_count, last_bytes;

		// Get the number of characters needed to decode a byte buffer.
		public override int GetCharCount (byte [] bytes, int index, int count)
		{
			return GetCharCount (bytes, index, count, false);
		}

		// Get the number of characters needed to decode a byte buffer.
		// When "refresh" is true a dangling lead byte is counted as one
		// (replacement) character and the pending state is cleared.
#if NET_2_0
		public override
#else
		internal
#endif
		int GetCharCount (byte [] bytes, int index, int count, bool refresh)
		{
			CheckRange (bytes, index, count);

			int length = 0;
			int last = last_count;

			while (count > 0) {
				int byteval = bytes [index++];
				--count;

				if (last == 0) {
					if (byteval == 0x8E ||
					    (byteval >= 0xA1 && byteval <= 0xFE)) {
						// Lead byte of half-width katakana or
						// of a double-byte sequence.
						last = byteval;
					} else {
						// ASCII/control character, or an invalid
						// lead byte (including 0x8F): one char.
						length++;
					}
				} else {
					// A trail byte always completes the sequence
					// with exactly one character, valid or not,
					// so no table lookup is needed for counting.
					last = 0;
					length++;
				}
			}

			if (refresh) {
				// seems like .NET 2.0 adds \u30FB for insufficient
				// byte sequence (for Japanese \u30FB makes sense).
				if (last != 0)
					length++;
				// Flushing discards the pending byte so that it
				// cannot leak into a later call.
				last_count = 0;
			} else {
				last_count = last;
			}

			// Return the final length to the caller.
			return length;
		}

		// Decode a byte buffer into chars starting at charIndex.
		public override int GetChars (byte [] bytes, int byteIndex,
					      int byteCount, char [] chars,
					      int charIndex)
		{
			return GetChars (bytes, byteIndex, byteCount, chars, charIndex, false);
		}

		// Decode a byte buffer into chars starting at charIndex and
		// return the number of characters written. When "refresh" is
		// true a dangling lead byte is emitted as U+30FB and the
		// pending state is cleared. Throws ArgumentException when
		// chars is too small.
#if NET_2_0
		public override
#else
		internal
#endif
		int GetChars (byte [] bytes, int byteIndex,
			      int byteCount, char [] chars,
			      int charIndex, bool refresh)
		{
			CheckRange (bytes, byteIndex, byteCount, chars, charIndex);

			int posn = charIndex;
			int charLength = chars.Length;
			int last = last_bytes;
			byte [] table0208 = JISConvert.Convert.jisx0208ToUnicode;
			byte [] table0212 = JISConvert.Convert.jisx0212ToUnicode;

			while (byteCount > 0) {
				int byteval = bytes [byteIndex++];
				--byteCount;
				char decoded;

				if (last == 0) {
					if (byteval <= 0x7F) {
						// Ordinary ASCII/control character.
						decoded = (char) byteval;
					} else if (byteval == 0x8E ||
						   (byteval >= 0xA1 && byteval <= 0xFE)) {
						// Lead byte of half-width katakana or
						// of a double-byte sequence.
						last = byteval;
						continue;
					} else {
						// Invalid first byte. This includes 0x8F:
						// 3-byte sequences are not supported.
						decoded = InvalidChar;
					}
				} else if (last == 0x8E) {
					// Second byte of half-width katakana:
					// 0xA1-0xDF map onto U+FF61-U+FF9F.
					last = 0;
					if (byteval >= 0xA1 && byteval <= 0xDF)
						decoded = (char) (0xFF00 | (byteval - 0x40));
					else
						decoded = InvalidChar;
				} else {
					// Second byte in a double-byte sequence.
					int lead = last;
					last = 0;
					if (byteval >= 0xA1 && byteval <= 0xFE) {
						int value = ((lead - 0xA1) * 0x5E +
							     (byteval - 0xA1)) * 2;
						value = ((int) (table0208 [value])) |
							(((int) (table0208 [value + 1])) << 8);
						// NOTE(review): value is 0 here, so this
						// always reads entry 0 of the JIS X 0212
						// table rather than the entry for the
						// decoded position. Kept as-is to preserve
						// behaviour; confirm what was intended.
						if (value == 0)
							value = ((int) (table0212 [value])) |
								(((int) (table0212 [value + 1])) << 8);
						decoded = (value != 0) ? (char) value : InvalidChar;
					} else {
						// Invalid second byte.
						decoded = InvalidChar;
					}
				}

				if (posn >= charLength)
					throw Insufficient ();
				chars [posn++] = decoded;
			}

			if (refresh) {
				// seems like .NET 2.0 adds \u30FB for insufficient
				// byte sequence (for Japanese \u30FB makes sense).
				if (last != 0) {
					if (posn >= charLength)
						throw Insufficient ();
					chars [posn++] = InvalidChar;
				}
				// Flushing discards the pending byte so that it
				// cannot leak into a later call.
				last_bytes = 0;
			} else {
				last_bytes = last;
			}

			// Return the final length to the caller.
			return posn - charIndex;
		}

		// Build the exception thrown when the output char buffer
		// is too small.
		Exception Insufficient ()
		{
			return new ArgumentException (
				Strings.GetString ("Arg_InsufficientSpace"),
				"chars");
		}
	} // class CP51932Decoder

	// Alias class named after the encoding's web name ("euc-jp").
	[Serializable]
	public class ENCeuc_jp : CP51932
	{
		public ENCeuc_jp () : base ()
		{
		}
	} // class ENCeucjp

} // namespace I18N.CJK