2 * CP51932.cs - Japanese EUC-JP code page.
4 * It is based on CP932.cs from Portable.NET
7 * Atsushi Enomoto <atsushi@ximian.com>
9 * Below are original (CP932.cs) copyright lines
13 * Copyright (c) 2002 Southern Storm Software, Pty Ltd
15 * Permission is hereby granted, free of charge, to any person obtaining
16 * a copy of this software and associated documentation files (the "Software"),
17 * to deal in the Software without restriction, including without limitation
18 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
19 * and/or sell copies of the Software, and to permit persons to whom the
20 * Software is furnished to do so, subject to the following conditions:
22 * The above copyright notice and this permission notice shall be included
23 * in all copies or substantial portions of the Software.
25 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
26 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
27 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
28 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
29 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
30 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
31 * OTHER DEALINGS IN THE SOFTWARE.
36 Well, there appears to be no jis.table source. Thus, it seems that it was
37 generated from text files on the Unicode home page, such as
38 ftp://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0208.TXT
39 However, that file is non-normative, and in Japan it is known to contain many problems.
41 FIXME: Some characters such as 0xFF0B (wide "plus") are missing in
46 0x00-0x1F, 0x7F : control characters
48 0xA1A1-0xFEFE : Kanji (precisely, both bytes contain only A1-FE)
49 0x8EA1-0x8EDF : half-width Katakana
50 0x8FA1A1-0x8FFEFE : Complemental Kanji
62 using MonoEncoder = I18N.Common.MonoSafeEncoder;
63 using MonoEncoding = I18N.Common.MonoSafeEncoding;
// Encoding class for EUC-JP. Every conversion method below builds a fresh
// CP51932Encoder or CP51932Decoder and delegates to it, always passing
// true as the final (flush/refresh) argument.
67 public class CP51932 : MonoEncoding
69 // Magic number used by Windows for the EUC-JP code page.
70 private const int EUC_JP_CODE_PAGE = 51932;
// Forwards the EUC-JP code page number and 932 to the MonoEncoding base.
// NOTE(review): 932 is presumably the Windows code page associated with
// this encoding - confirm against the MonoEncoding constructor.
73 public CP51932 () : base (EUC_JP_CODE_PAGE, 932)
// Count the bytes needed to encode "count" chars read from a raw pointer.
78 public unsafe override int GetByteCountImpl (char* chars, int count)
80 return new CP51932Encoder (this).GetByteCountImpl (chars, count, true);
// Encode "charCount" chars from a raw pointer into a raw byte buffer of
// "byteCount" bytes; returns whatever the encoder returns.
83 public unsafe override int GetBytesImpl (char* chars, int charCount, byte* bytes, int byteCount)
85 return new CP51932Encoder (this).GetBytesImpl (chars, charCount, bytes, byteCount, true);
// Array-based overload of the byte count, delegated to a new encoder.
88 public override int GetByteCount (char [] chars, int index, int length)
90 return new CP51932Encoder (this).GetByteCount (chars, index, length, true);
// Array-based overload of the encoding step, delegated to a new encoder.
93 public override int GetBytes(char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex)
95 return new CP51932Encoder (this).GetBytes (chars, charIndex, charCount, bytes, byteIndex, true);
// Count the chars produced by decoding a byte range. A new decoder is
// used for each call, and refresh is passed as true.
99 public override int GetCharCount (byte [] bytes, int index, int count)
101 return new CP51932Decoder ().GetCharCount (
102 bytes, index, count, true);
// Decode a byte range into "chars" starting at charIndex. A new decoder
// is used for each call, and refresh is passed as true.
105 public override int GetChars (
106 byte [] bytes, int byteIndex, int byteCount,
107 char [] chars, int charIndex)
109 return new CP51932Decoder ().GetChars (bytes,
110 byteIndex, byteCount, chars, charIndex, true);
113 // Get the maximum number of bytes needed to encode a
114 // specified number of characters.
// The bound used is three bytes per character.
// NOTE(review): the ArgumentOutOfRangeException presumably guards against
// a negative charCount (the message key is ArgRange_NonNegative) - confirm.
115 public override int GetMaxByteCount(int charCount)
119 throw new ArgumentOutOfRangeException
121 Strings.GetString("ArgRange_NonNegative"));
123 return charCount * 3;
126 // Get the maximum number of characters needed to decode a
127 // specified number of bytes.
// NOTE(review): the ArgumentOutOfRangeException presumably guards against
// a negative byteCount (the message key is ArgRange_NonNegative) - confirm.
128 public override int GetMaxCharCount(int byteCount)
132 throw new ArgumentOutOfRangeException
134 Strings.GetString ("ArgRange_NonNegative"));
// Create an encoder bound to this encoding instance.
139 public override Encoder GetEncoder ()
141 return new CP51932Encoder (this);
// Create a new decoder.
144 public override Decoder GetDecoder ()
146 return new CP51932Decoder ();
151 // Get the mail body name for this encoding.
152 public override String BodyName {
153 get { return "euc-jp"; }
156 // Get the human-readable name for this encoding.
157 public override String EncodingName {
158 get { return "Japanese (EUC)"; }
161 // Get the mail agent header name for this encoding.
162 public override String HeaderName {
163 get { return "euc-jp"; }
166 // Determine if this encoding can be displayed in a Web browser.
167 public override bool IsBrowserDisplay {
171 // Determine if this encoding can be saved from a Web browser.
172 public override bool IsBrowserSave {
176 // Determine if this encoding can be displayed in a mail/news agent.
177 public override bool IsMailNewsDisplay {
181 // Determine if this encoding can be saved from a mail/news agent.
182 public override bool IsMailNewsSave {
186 // Get the IANA-preferred Web name for this encoding.
187 public override String WebName {
188 get { return "euc-jp"; }
191 #endif // !ECMA_COMPAT
// Encoder that turns UTF-16 chars into EUC-JP bytes. It is constructed
// with the owning MonoEncoding instance.
193 public class CP51932Encoder : MonoEncoder
// NOTE(review): "encoding" is presumably forwarded to the MonoEncoder
// base constructor - confirm.
195 public CP51932Encoder (MonoEncoding encoding)
201 // Get the number of bytes needed to encode a character buffer.
// Pointer-based variant. Each char is classified by code point range and
// "length" is advanced accordingly. The cjkToJis and extraToJis tables are
// read as 16-bit little-endian entries: low byte at the even index, high
// byte at the following index.
// NOTE(review): this method tests ch < 0xFF60 for the extra-character
// range while GetBytesImpl tests ch <= 0xFF60, so U+FF60 is classified
// differently by the two - confirm which is intended.
// NOTE(review): GetBytesImpl shows no counterpart to the Latin-1 branch
// below - confirm that the count and the encoded size agree for such chars.
202 public unsafe override int GetByteCountImpl (
203 char* chars, int count, bool refresh)
205 // Determine the length of the final output.
209 byte [] cjkToJis = JISConvert.Convert.cjkToJis;
210 byte [] extraToJis = JISConvert.Convert.extraToJis;
213 ch = chars [index++];
217 // Character maps to itself.
219 } else if (ch < 0x0100) {
220 // Check for special Latin 1 characters that
221 // can be mapped to double-byte code points.
222 if(ch == 0x00A2 || ch == 0x00A3 || ch == 0x00A7 ||
223 ch == 0x00A8 || ch == 0x00AC || ch == 0x00B0 ||
224 ch == 0x00B1 || ch == 0x00B4 || ch == 0x00B6 ||
225 ch == 0x00D7 || ch == 0x00F7)
229 } else if (ch >= 0x0391 && ch <= 0x0451) {
230 // Greek subset characters (the tested range runs through U+0451,
// so it also spans Cyrillic).
232 } else if (ch >= 0x2010 && ch <= 0x9FA5) {
233 // This range contains the bulk of the CJK set.
// Two table bytes per code point, indexed from U+2010.
234 value = (ch - 0x2010) * 2;
235 value = ((int) (cjkToJis[value])) | (((int)(cjkToJis[value + 1])) << 8);
238 } else if(ch >= 0xFF01 && ch < 0xFF60) {
239 // This range contains extra characters.
// Two table bytes per code point, indexed from U+FF01.
240 value = (ch - 0xFF01) * 2;
241 value = ((int)(extraToJis[value])) |
242 (((int)(extraToJis[value + 1])) << 8);
245 } else if(ch >= 0xFF60 && ch <= 0xFFA0) {
246 ++length; // half-width kana
250 // Return the length to the caller.
254 // Get the bytes that result from encoding a character buffer.
// Pointer-based variant. For each char a JIS "value" is computed from the
// lookup tables and then emitted as one or two bytes. Returns the number
// of bytes written (posn - byteIndex). Throws ArgumentException for the
// "bytes" parameter when the output buffer has no room.
255 public unsafe override int GetBytesImpl (
256 char* chars, int charCount, byte* bytes, int byteCount, bool refresh)
262 // Convert the characters into their byte form.
263 int posn = byteIndex;
264 int byteLength = byteCount;
// Conversion tables hold 16-bit little-endian entries (low byte first).
267 byte[] cjkToJis = JISConvert.Convert.cjkToJis;
268 byte[] greekToJis = JISConvert.Convert.greekToJis;
269 byte[] extraToJis = JISConvert.Convert.extraToJis;
// charCount is decremented with every char consumed; it is also passed
// by reference to the fallback call below.
271 for (int i = charIndex; i < end; i++, charCount--) {
// At least one output byte is required for every char.
273 if (posn >= byteLength) {
274 throw new ArgumentException (Strings.GetString ("Arg_InsufficientSpace"), "bytes");
278 // Character maps to itself.
279 bytes[posn++] = (byte)ch;
281 } else if (ch >= 0x0391 && ch <= 0x0451) {
282 // Greek subset characters (the tested range runs through U+0451,
// so it also spans Cyrillic).
283 value = (ch - 0x0391) * 2;
284 value = ((int)(greekToJis[value])) |
285 (((int)(greekToJis[value + 1])) << 8);
286 } else if (ch >= 0x2010 && ch <= 0x9FA5) {
287 // This range contains the bulk of the CJK set.
288 value = (ch - 0x2010) * 2;
289 value = ((int) (cjkToJis[value])) |
290 (((int)(cjkToJis[value + 1])) << 8);
// NOTE(review): the upper bound here is inclusive (<= 0xFF60), so the
// next branch's ch >= 0xFF60 test is only reached for ch > 0xFF60.
// GetByteCountImpl uses ch < 0xFF60 instead - confirm which is intended.
291 } else if (ch >= 0xFF01 && ch <= 0xFF60) {
292 // This range contains extra characters,
293 // including half-width katakana.
// NOTE(review): the 0xFF60..0xFFA0 range is handled by the following
// branch, so the "half-width katakana" remark above looks stale - confirm.
294 value = (ch - 0xFF01) * 2;
295 value = ((int) (extraToJis [value])) |
296 (((int) (extraToJis [value + 1])) << 8);
297 } else if (ch >= 0xFF60 && ch <= 0xFFA0) {
// Build 0x8Exx so the emit step below writes 0x8E followed by
// 0xA0 + (ch - 0xFF60).
298 value = ch - 0xFF60 + 0x8EA0;
300 // Invalid character.
306 chars, ref i, ref charCount,
307 bytes, ref posn, ref byteCount, null);
308 } else if (value < 0x0100) {
// Single-byte result.
309 bytes [posn++] = (byte) value;
// Everything below writes two bytes, so two free slots are required.
310 } else if ((posn + 1) >= byteLength) {
311 throw new ArgumentException (Strings.GetString ("Arg_InsufficientSpace"), "bytes");
312 } else if (value < 0x8000) {
313 // general 2byte glyph/kanji
// value is a linear index over rows of 0x5E (94) cells; the row and the
// column are each offset by 0xA1 to form the two output bytes.
315 bytes [posn++] = (byte) (value / 0x5E + 0xA1);
316 bytes [posn++] = (byte) (value % 0x5E + 0xA1);
317 //Console.WriteLine ("{0:X04}", ch);
// Lead byte 0x8E followed by the low part of value.
323 bytes [posn++] = 0x8E;
324 bytes [posn++] = (byte) (value - 0x8E00);
328 // Return the final length to the caller.
329 return posn - byteIndex;
332 // Get the number of bytes needed to encode a character buffer.
// Array-based variant with the same range classification as
// GetByteCountImpl. The cjkToJis and extraToJis tables are read as 16-bit
// little-endian entries (low byte at the even index).
// NOTE(review): this method tests ch < 0xFF60 for the extra-character
// range while GetBytes tests ch <= 0xFF60, so U+FF60 is classified
// differently by the two - confirm which is intended.
333 public override int GetByteCount(char[] chars, int index, int count, bool flush)
335 // Determine the length of the final output.
338 byte[] cjkToJis = JISConvert.Convert.cjkToJis;
339 byte[] extraToJis = JISConvert.Convert.extraToJis;
348 // Character maps to itself.
351 else if (ch < 0x0100)
353 // Check for special Latin 1 characters that
354 // can be mapped to double-byte code points.
355 if (ch == 0x00A2 || ch == 0x00A3 || ch == 0x00A7 ||
356 ch == 0x00A8 || ch == 0x00AC || ch == 0x00B0 ||
357 ch == 0x00B1 || ch == 0x00B4 || ch == 0x00B6 ||
358 ch == 0x00D7 || ch == 0x00F7)
363 else if (ch >= 0x0391 && ch <= 0x0451)
365 // Greek subset characters (the tested range runs through U+0451,
// so it also spans Cyrillic).
368 else if (ch >= 0x2010 && ch <= 0x9FA5)
370 // This range contains the bulk of the CJK set.
// Two table bytes per code point, indexed from U+2010.
371 value = (ch - 0x2010) * 2;
372 value = ((int)(cjkToJis[value])) | (((int)(cjkToJis[value + 1])) << 8);
376 else if (ch >= 0xFF01 && ch < 0xFF60)
378 // This range contains extra characters.
// Two table bytes per code point, indexed from U+FF01.
379 value = (ch - 0xFF01) * 2;
380 value = ((int)(extraToJis[value])) |
381 (((int)(extraToJis[value + 1])) << 8);
385 else if (ch >= 0xFF60 && ch <= 0xFFA0)
387 ++length; // half-width kana
391 // Return the length to the caller.
395 // Get the bytes that result from encoding a character buffer.
// Array-based variant. For each char a JIS "value" is computed from the
// lookup tables and then emitted as one or two bytes. Returns the number
// of bytes written (posn - byteIndex). Throws ArgumentException for the
// "bytes" parameter when the output array has no room.
396 public override int GetBytes(char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex, bool flush)
398 // Convert the characters into their byte form.
399 int posn = byteIndex;
// NOTE(review): both limits are the full array length and do not subtract
// byteIndex; byteCount is what gets passed to HandleFallback - confirm
// that the fallback expects a total length rather than remaining space.
400 int byteLength = bytes.Length;
401 int byteCount = bytes.Length;
402 int end = charIndex + charCount;
// Conversion tables hold 16-bit little-endian entries (low byte first).
405 byte[] cjkToJis = JISConvert.Convert.cjkToJis;
406 byte[] greekToJis = JISConvert.Convert.greekToJis;
407 byte[] extraToJis = JISConvert.Convert.extraToJis;
// charCount is decremented with every char consumed; it is also passed
// by reference to HandleFallback below.
409 for (int i = charIndex; i < end; i++, charCount--)
// At least one output byte is required for every char.
412 if (posn >= byteLength)
414 throw new ArgumentException(Strings.GetString("Arg_InsufficientSpace"), "bytes");
419 // Character maps to itself.
420 bytes[posn++] = (byte)ch;
423 else if (ch >= 0x0391 && ch <= 0x0451)
425 // Greek subset characters (the tested range runs through U+0451,
// so it also spans Cyrillic).
426 value = (ch - 0x0391) * 2;
427 value = ((int)(greekToJis[value])) |
428 (((int)(greekToJis[value + 1])) << 8);
430 else if (ch >= 0x2010 && ch <= 0x9FA5)
432 // This range contains the bulk of the CJK set.
433 value = (ch - 0x2010) * 2;
434 value = ((int)(cjkToJis[value])) |
435 (((int)(cjkToJis[value + 1])) << 8);
// NOTE(review): the upper bound here is inclusive (<= 0xFF60), so the
// next branch's ch >= 0xFF60 test is only reached for ch > 0xFF60.
// GetByteCount uses ch < 0xFF60 instead - confirm which is intended.
437 else if (ch >= 0xFF01 && ch <= 0xFF60)
439 // This range contains extra characters,
440 // including half-width katakana.
// NOTE(review): the 0xFF60..0xFFA0 range is handled by the following
// branch, so the "half-width katakana" remark above looks stale - confirm.
441 value = (ch - 0xFF01) * 2;
442 value = ((int)(extraToJis[value])) |
443 (((int)(extraToJis[value + 1])) << 8);
445 else if (ch >= 0xFF60 && ch <= 0xFFA0)
// Build 0x8Exx so the emit step below writes 0x8E followed by
// 0xA0 + (ch - 0xFF60).
447 value = ch - 0xFF60 + 0x8EA0;
451 // Invalid character.
457 HandleFallback (chars, ref i, ref charCount,
458 bytes, ref posn, ref byteCount, null);
460 else if (value < 0x0100)
// Single-byte result.
462 bytes[posn++] = (byte)value;
// Everything below writes two bytes, so two free slots are required.
464 else if ((posn + 1) >= byteLength)
466 throw new ArgumentException(Strings.GetString("Arg_InsufficientSpace"), "bytes");
468 else if (value < 0x8000)
470 // general 2byte glyph/kanji
// value is a linear index over rows of 0x5E (94) cells; the row and the
// column are each offset by 0xA1 to form the two output bytes.
472 bytes[posn++] = (byte)(value / 0x5E + 0xA1);
473 bytes[posn++] = (byte)(value % 0x5E + 0xA1);
474 //Console.WriteLine ("{0:X04}", ch);
// Lead byte 0x8E followed by the low part of value.
480 bytes[posn++] = 0x8E;
481 bytes[posn++] = (byte)(value - 0x8E00);
485 // Return the final length to the caller.
486 return posn - byteIndex;
// Decoder that turns EUC-JP bytes into UTF-16 chars.
491 internal class CP51932Decoder : DbcsEncoding.DbcsDecoder
493 public CP51932Decoder ()
// Decoder state read at the start of each call: GetCharCount initialises
// its local "last" from last_count, and GetChars from last_bytes.
// NOTE(review): presumably these hold a pending lead byte carried over
// from the previous call - confirm where they are written back.
498 int last_count, last_bytes;
500 // Get the number of characters needed to decode a byte buffer.
// Delegates to the four-argument overload with refresh = false.
501 public override int GetCharCount (byte [] bytes, int index, int count)
503 return GetCharCount (bytes, index, count, false);
// Counts the chars that decoding bytes[index .. index+count) would yield.
// Bytes are classified against the pending lead byte "last", which starts
// from the last_count field. When refresh is true and a lead byte is still
// pending at the end, the trailing block handles that incomplete sequence.
507 int GetCharCount (byte [] bytes, int index, int count, bool refresh)
509 CheckRange (bytes, index, count);
511 // Determine the total length of the converted string.
// Lookup tables hold 16-bit little-endian entries (low byte first).
513 byte[] table0208 = JISConvert.Convert.jisx0208ToUnicode;
514 byte[] table0212 = JISConvert.Convert.jisx0212ToUnicode;
517 int last = last_count;
520 byteval = bytes [index++];
523 if (byteval == 0x8F) {
524 // SS3: One-time triple-byte sequence should follow.
526 } else if (byteval <= 0x7F) {
527 // Ordinary ASCII/control character (the test is byteval <= 0x7F).
529 } else if (byteval == 0x8E) {
530 // SS2: One-time double-byte sequence should follow.
532 } else if (byteval >= 0xA1 && byteval <= 0xFE) {
533 // First byte in a double-byte sequence.
536 // Invalid first byte.
540 else if (last == 0x8E) {
541 // SS2 (One-time double-byte sequence)
542 if (byteval >= 0xA1 && byteval <= 0xDF) {
545 // Invalid second byte.
550 else if (last == 0x8F) {
551 // SS3: 3-byte character
552 // FIXME: not supported (I don't think iso-2022-jp has it)
557 // Second byte in a double-byte sequence.
// Linear index: (lead - 0xA1) rows of 0x5E cells plus (trail - 0xA1).
558 value = (last - 0xA1) * 0x5E;
560 if (byteval >= 0xA1 && byteval <= 0xFE)
562 value += (byteval - 0xA1);
566 // Invalid second byte.
573 value = ((int) (table0208 [value]))
574 | (((int) (table0208 [value + 1])) << 8);
576 value = ((int) (table0212 [value]))
577 | (((int) (table0212 [value + 1])) << 8);
585 // seems like .NET 2.0 adds \u30FB for insufficient
586 // byte sequence (for Japanese \u30FB makes sense).
587 if (refresh && last != 0)
592 // Return the final length to the caller.
// Decode a byte range into "chars". Delegates to the six-argument
// overload with refresh = false.
596 public override int GetChars (byte[] bytes, int byteIndex,
597 int byteCount, char[] chars,
600 return GetChars (bytes, byteIndex, byteCount, chars, charIndex, false);
// Decodes bytes[byteIndex .. byteIndex+byteCount) into chars starting at
// charIndex and returns the number of chars written (posn - charIndex).
// Bytes are classified against the pending lead byte "last", which starts
// from the last_bytes field. '\u30FB' is written for invalid input and,
// when refresh is true, for a lead byte still pending at the end.
// Every write is preceded by a capacity check that throws Insufficient().
604 int GetChars (byte[] bytes, int byteIndex,
605 int byteCount, char[] chars,
606 int charIndex, bool refresh)
608 CheckRange (bytes, byteIndex, byteCount, chars, charIndex);
610 // Decode the bytes in the buffer.
611 int posn = charIndex;
// The output limit is the end of the whole chars array.
612 int charLength = chars.Length;
614 int last = last_bytes;
// Lookup tables hold 16-bit little-endian entries (low byte first).
615 byte[] table0208 = JISConvert.Convert.jisx0208ToUnicode;
616 byte[] table0212 = JISConvert.Convert.jisx0212ToUnicode;
618 while (byteCount > 0) {
619 byteval = bytes [byteIndex++];
622 if (byteval == 0x8F) {
623 // SS3 (One-time triple-byte sequence) should follow.
625 } else if (byteval <= 0x7F) {
626 // Ordinary ASCII/control character (the test is byteval <= 0x7F).
627 if (posn >= charLength)
628 throw Insufficient ();
629 chars [posn++] = (char) byteval;
630 } else if (byteval == 0x8E) {
631 // SS2 (One-time double-byte sequence) should follow.
633 } else if (byteval >= 0xA1 && byteval <= 0xFE) {
634 // First byte in a double-byte sequence.
637 // Invalid first byte.
638 if (posn >= charLength)
639 throw Insufficient ();
640 chars [posn++] = '\u30FB';
643 else if (last == 0x8E) {
644 // SS2 (One-time double-byte sequence)
645 if (byteval >= 0xA1 && byteval <= 0xDF) {
646 value = ((byteval - 0x40) |
648 if (posn >= charLength)
649 throw Insufficient ();
650 chars [posn++] = (char) value;
652 // Invalid second byte.
653 if (posn >= charLength)
654 throw Insufficient ();
655 chars [posn++] = '\u30FB';
659 else if (last == 0x8F) {
660 // SS3: 3-byte character
661 // FIXME: not supported (I don't think iso-2022-jp has it)
666 // Second byte in a double-byte sequence.
// Linear index: (lead - 0xA1) rows of 0x5E cells plus (trail - 0xA1).
667 value = (last - 0xA1) * 0x5E;
669 if (byteval >= 0xA1 && byteval <= 0xFE)
671 value += (byteval - 0xA1);
675 // Invalid second byte.
677 if (posn >= charLength)
678 throw Insufficient ();
679 chars [posn++] = '\u30FB';
684 value = ((int) (table0208 [value]))
685 | (((int) (table0208 [value + 1])) << 8);
687 value = ((int) (table0212 [value]))
688 | (((int) (table0212 [value + 1])) << 8);
689 if (posn >= charLength)
690 throw Insufficient ();
692 chars [posn++] = (char)value;
694 chars [posn++] = '\u30FB';
698 if (refresh && last != 0) {
699 // seems like .NET 2.0 adds \u30FB for insufficient
700 // byte sequence (for Japanese \u30FB makes sense).
701 if (posn >= charLength)
702 throw Insufficient ();
703 chars [posn++] = '\u30FB';
708 // Return the final length to the caller.
709 return posn - charIndex;
// Helper for the "output buffer too small" error on the "chars" parameter.
// It is declared to return Exception so call sites can write
// "throw Insufficient ()", but the visible body itself throws the
// ArgumentException rather than returning it.
712 Exception Insufficient ()
714 throw new ArgumentException
716 ("Arg_InsufficientSpace"), "chars");
718 }; // class CP51932Decoder
// Subclass of CP51932 that adds nothing beyond a parameterless constructor.
// NOTE(review): presumably this exists so the encoding can be located by
// its "euc-jp" name - confirm against the I18N lookup code.
721 public class ENCeuc_jp : CP51932
723 public ENCeuc_jp () : base() {}
727 }; // namespace I18N.CJK