2 * CP932.cs - Japanese (Shift-JIS) code page.
4 * Copyright (c) 2002 Southern Storm Software, Pty Ltd
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included
14 * in all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
17 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
26 // Copyright (C) 2005-2006 Novell, Inc.
37 using MonoEncoder = I18N.Common.MonoSafeEncoder;
38 using MonoEncoding = I18N.Common.MonoSafeEncoding;
// Encoding class for the Japanese Shift-JIS code page (code page 932).
// It derives from MonoEncoding, which the using-alias in this file maps to
// I18N.Common.MonoSafeEncoding.
42 public class CP932 : MonoEncoding
44 // Magic number used by Windows for the Shift-JIS code page.
45 private const int SHIFTJIS_CODE_PAGE = 932;
// Construct the encoding, passing the Shift-JIS code page number to the base class.
48 public CP932() : base(SHIFTJIS_CODE_PAGE)
53 // Get the number of bytes needed to encode a character buffer.
// Takes a character pointer and a count; there is no output buffer, so only
// the encoded length is computed.
54 public unsafe override int GetByteCountImpl (char* chars, int count)
58 // Determine the length of the final output.
// The lookup tables are declared twice below, once as raw pointers and once
// as arrays. NOTE(review): presumably only one pair is compiled in, selected
// by a preprocessor conditional -- confirm.
62 byte *cjkToJis = JISConvert.Convert.cjkToJis;
63 byte *extraToJis = JISConvert.Convert.extraToJis;
65 byte[] cjkToJis = JISConvert.Convert.cjkToJis;
66 byte[] extraToJis = JISConvert.Convert.extraToJis;
75 // Character maps to itself.
80 // Check for special Latin 1 characters that
81 // can be mapped to double-byte code points.
82 if(ch == 0x00A2 || ch == 0x00A3 || ch == 0x00A7 ||
83 ch == 0x00A8 || ch == 0x00AC || ch == 0x00B0 ||
84 ch == 0x00B1 || ch == 0x00B4 || ch == 0x00B6 ||
85 ch == 0x00D7 || ch == 0x00F7)
90 else if(ch >= 0x0391 && ch <= 0x0451)
92 // Greek subset characters.
95 else if(ch >= 0x2010 && ch <= 0x9FA5)
97 // This range contains the bulk of the CJK set.
// Each table entry is two bytes wide and stored low byte first.
98 value = (ch - 0x2010) * 2;
99 value = ((int)(cjkToJis[value])) |
100 (((int)(cjkToJis[value + 1])) << 8);
// 0xE000-0xE757 lies in the Unicode Private Use Area.
106 else if(ch >= 0xE000 && ch <= 0xE757)
// NOTE(review): GetBytesImpl splits this range at 0xFF60 and stops at 0xFFA0,
// whereas this method uses one table lookup for everything up to 0xFFEF --
// confirm that both methods agree on the resulting byte counts.
109 else if(ch >= 0xFF01 && ch <= 0xFFEF)
111 // This range contains extra characters,
112 // including half-width katakana.
113 value = (ch - 0xFF01) * 2;
114 value = ((int)(extraToJis[value])) |
115 (((int)(extraToJis[value + 1])) << 8);
123 // Return the length to the caller.
127 // Get the bytes that result from encoding a character buffer.
// Pointer-based variant. The return value is the number of bytes written,
// computed as posn - byteIndex.
128 public unsafe override int GetBytesImpl (
129 char* chars, int charCount, byte* bytes, int byteCount)
// Fallback buffer, passed by reference to HandleFallback.
133 EncoderFallbackBuffer buffer = null;
135 // Convert the characters into their byte form.
// posn is the current write position in the output.
136 int posn = byteIndex;
// byteLength is the capacity that posn is checked against before each write.
138 int byteLength = byteCount;
// The lookup tables are declared twice below, once as raw pointers and once
// as arrays. NOTE(review): presumably only one set is compiled in, selected
// by a preprocessor conditional -- confirm.
141 byte *cjkToJis = JISConvert.Convert.cjkToJis;
142 byte *greekToJis = JISConvert.Convert.greekToJis;
143 byte *extraToJis = JISConvert.Convert.extraToJis;
145 byte[] cjkToJis = JISConvert.Convert.cjkToJis;
146 byte[] greekToJis = JISConvert.Convert.greekToJis;
147 byte[] extraToJis = JISConvert.Convert.extraToJis;
// charCount is decremented together with i, so it tracks the characters left.
149 for (int i = charIndex; i < end; i++, charCount--)
// There must be room for at least one more output byte.
152 if(posn >= byteLength)
154 throw new ArgumentException
155 (Strings.GetString("Arg_InsufficientSpace"),
160 // Character maps to itself.
161 bytes[posn++] = (byte)ch;
166 // Check for special Latin 1 characters that
167 // can be mapped to double-byte code points.
168 if(ch == 0x00A2 || ch == 0x00A3 || ch == 0x00A7 ||
169 ch == 0x00A8 || ch == 0x00AC || ch == 0x00B0 ||
170 ch == 0x00B1 || ch == 0x00B4 || ch == 0x00B6 ||
171 ch == 0x00D7 || ch == 0x00F7)
// These characters need two bytes of output space.
173 if((posn + 1) >= byteLength)
175 throw new ArgumentException
177 ("Arg_InsufficientSpace"), "bytes");
// Every pair below is lead byte 0x81 followed by a character-specific trail byte.
182 bytes[posn++] = (byte)0x81;
183 bytes[posn++] = (byte)0x91;
187 bytes[posn++] = (byte)0x81;
188 bytes[posn++] = (byte)0x92;
192 bytes[posn++] = (byte)0x81;
193 bytes[posn++] = (byte)0x98;
197 bytes[posn++] = (byte)0x81;
198 bytes[posn++] = (byte)0x4E;
202 bytes[posn++] = (byte)0x81;
203 bytes[posn++] = (byte)0xCA;
207 bytes[posn++] = (byte)0x81;
208 bytes[posn++] = (byte)0x8B;
212 bytes[posn++] = (byte)0x81;
213 bytes[posn++] = (byte)0x7D;
217 bytes[posn++] = (byte)0x81;
218 bytes[posn++] = (byte)0x4C;
222 bytes[posn++] = (byte)0x81;
223 bytes[posn++] = (byte)0xF7;
227 bytes[posn++] = (byte)0x81;
228 bytes[posn++] = (byte)0x7E;
232 bytes[posn++] = (byte)0x81;
233 bytes[posn++] = (byte)0x80;
// U+00A5 is written as the single byte 0x5C.
237 else if(ch == 0x00A5)
240 bytes[posn++] = (byte)0x5C;
// Delegate to HandleFallback, passing the current input and output positions by reference.
244 HandleFallback (ref buffer,
245 chars, ref charIndex, ref charCount,
246 bytes, ref posn, ref byteCount, null);
250 else if(ch >= 0x0391 && ch <= 0x0451)
252 // Greek subset characters.
// Each table entry is two bytes wide and stored low byte first.
253 value = (ch - 0x0391) * 2;
254 value = ((int)(greekToJis[value])) |
255 (((int)(greekToJis[value + 1])) << 8);
257 else if(ch >= 0x2010 && ch <= 0x9FA5)
259 // This range contains the bulk of the CJK set.
260 value = (ch - 0x2010) * 2;
261 value = ((int)(cjkToJis[value])) |
262 (((int)(cjkToJis[value + 1])) << 8);
// 0xE000-0xE757 lies in the Unicode Private Use Area; the value is derived
// arithmetically from the offset into that range rather than from a table.
264 else if(ch >= 0xE000 && ch <= 0xE757)
267 int diff = ch - 0xE000;
268 value = ((int) (diff / 0xBC) << 8)
271 if (value % 0x100 >= 0x7F)
274 else if(ch >= 0xFF01 && ch <= 0xFF60)
276 value = (ch - 0xFF01) * 2;
277 value = ((int)(extraToJis[value])) |
278 (((int)(extraToJis[value + 1])) << 8);
// NOTE(review): 0xFF60 also satisfies the preceding test, so it never reaches
// this branch. The upper bound 0xFFA0 yields 0xE0, which CP932Decoder treats
// as the first byte of a double-byte sequence -- confirm the intended range.
280 else if(ch >= 0xFF60 && ch <= 0xFFA0)
// Single-byte result: 0xFF61 becomes 0xA1, 0xFF62 becomes 0xA2, and so on.
282 value = ch - 0xFF60 + 0xA0;
286 // Invalid character.
291 HandleFallback (ref buffer,
292 chars, ref charIndex, ref charCount,
293 bytes, ref posn, ref byteCount, null);
// Values below 0x0100 are written as a single byte.
295 else if(value < 0x0100)
297 bytes[posn++] = (byte)value;
// All remaining cases write two bytes, so two slots must be free.
299 else if((posn + 1) >= byteLength)
301 throw new ArgumentException
302 (Strings.GetString("Arg_InsufficientSpace"),
305 else if(value < 0x8000)
307 // JIS X 0208 character.
// Trail byte: position within a 0xBC-wide row, offset by 0x40.
310 value = (value % 0xBC) + 0x40;
// NOTE(review): ch appears to be reused here as a row index -- confirm.
// Rows below 0x1F get lead byte ch + 0x81; the rest continue from 0xE0.
315 if(ch < (0x9F - 0x80))
317 bytes[posn++] = (byte)(ch + 0x81);
321 bytes[posn++] = (byte)(ch - (0x9F - 0x80) + 0xE0);
323 bytes[posn++] = (byte)value;
// Values in 0xF040..0xF9FC are written unchanged, high byte first.
325 else if (value >= 0xF040 && value <= 0xF9FC)
328 bytes[posn++] = (byte) (value / 0x100);
329 bytes[posn++] = (byte) (value % 0x100);
333 // JIS X 0212 character, which Shift-JIS doesn't
334 // support, but we've already allocated two slots.
335 bytes[posn++] = (byte)'?';
336 bytes[posn++] = (byte)'?';
340 // Return the final length to the caller.
341 return posn - byteIndex;
344 // Get the number of bytes needed to encode a character buffer.
// Array-based variant: works on chars, index and count and writes no output.
345 public override int GetByteCount(char[] chars, int index, int count)
347 // Determine the length of the final output.
350 byte[] cjkToJis = JISConvert.Convert.cjkToJis;
351 byte[] extraToJis = JISConvert.Convert.extraToJis;
360 // Character maps to itself.
363 else if (ch < 0x0100)
365 // Check for special Latin 1 characters that
366 // can be mapped to double-byte code points.
367 if (ch == 0x00A2 || ch == 0x00A3 || ch == 0x00A7 ||
368 ch == 0x00A8 || ch == 0x00AC || ch == 0x00B0 ||
369 ch == 0x00B1 || ch == 0x00B4 || ch == 0x00B6 ||
370 ch == 0x00D7 || ch == 0x00F7)
375 else if (ch >= 0x0391 && ch <= 0x0451)
377 // Greek subset characters.
380 else if (ch >= 0x2010 && ch <= 0x9FA5)
382 // This range contains the bulk of the CJK set.
// Each table entry is two bytes wide and stored low byte first.
383 value = (ch - 0x2010) * 2;
384 value = ((int)(cjkToJis[value])) |
385 (((int)(cjkToJis[value + 1])) << 8);
// 0xE000-0xE757 lies in the Unicode Private Use Area.
391 else if (ch >= 0xE000 && ch <= 0xE757)
// NOTE(review): GetBytes splits this range at 0xFF60 and stops at 0xFFA0,
// whereas this method uses one table lookup for everything up to 0xFFEF --
// confirm that both methods agree on the resulting byte counts.
394 else if (ch >= 0xFF01 && ch <= 0xFFEF)
396 // This range contains extra characters,
397 // including half-width katakana.
398 value = (ch - 0xFF01) * 2;
399 value = ((int)(extraToJis[value])) |
400 (((int)(extraToJis[value + 1])) << 8);
408 // Return the length to the caller.
412 // Get the bytes that result from encoding a character buffer.
// Array-based variant. The return value is the number of bytes written,
// computed as posn - byteIndex.
413 public override int GetBytes(char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex)
// The output capacity is taken to be the full length of the bytes array.
415 int byteCount = bytes.Length;
// Fallback buffer, passed by reference to HandleFallback.
416 EncoderFallbackBuffer buffer = null;
418 // Convert the characters into their byte form.
// posn is the current write position; end is one past the last input character.
419 int posn = byteIndex;
420 int end = charIndex + charCount;
421 int byteLength = byteCount;
423 byte[] cjkToJis = JISConvert.Convert.cjkToJis;
424 byte[] greekToJis = JISConvert.Convert.greekToJis;
425 byte[] extraToJis = JISConvert.Convert.extraToJis;
// charCount is decremented together with i, so it tracks the characters left.
427 for (int i = charIndex; i < end; i++, charCount--)
// There must be room for at least one more output byte.
431 if (posn >= byteLength)
433 throw new ArgumentException
434 (Strings.GetString("Arg_InsufficientSpace"),
439 // Character maps to itself.
440 bytes[posn++] = (byte)ch;
443 else if (ch < 0x0100)
445 // Check for special Latin 1 characters that
446 // can be mapped to double-byte code points.
447 if (ch == 0x00A2 || ch == 0x00A3 || ch == 0x00A7 ||
448 ch == 0x00A8 || ch == 0x00AC || ch == 0x00B0 ||
449 ch == 0x00B1 || ch == 0x00B4 || ch == 0x00B6 ||
450 ch == 0x00D7 || ch == 0x00F7)
// These characters need two bytes of output space.
452 if ((posn + 1) >= byteLength)
454 throw new ArgumentException
456 ("Arg_InsufficientSpace"), "bytes");
// Every pair below is lead byte 0x81 followed by a character-specific trail byte.
461 bytes[posn++] = (byte)0x81;
462 bytes[posn++] = (byte)0x91;
466 bytes[posn++] = (byte)0x81;
467 bytes[posn++] = (byte)0x92;
471 bytes[posn++] = (byte)0x81;
472 bytes[posn++] = (byte)0x98;
476 bytes[posn++] = (byte)0x81;
477 bytes[posn++] = (byte)0x4E;
481 bytes[posn++] = (byte)0x81;
482 bytes[posn++] = (byte)0xCA;
486 bytes[posn++] = (byte)0x81;
487 bytes[posn++] = (byte)0x8B;
491 bytes[posn++] = (byte)0x81;
492 bytes[posn++] = (byte)0x7D;
496 bytes[posn++] = (byte)0x81;
497 bytes[posn++] = (byte)0x4C;
501 bytes[posn++] = (byte)0x81;
502 bytes[posn++] = (byte)0xF7;
506 bytes[posn++] = (byte)0x81;
507 bytes[posn++] = (byte)0x7E;
511 bytes[posn++] = (byte)0x81;
512 bytes[posn++] = (byte)0x80;
// U+00A5 is written as the single byte 0x5C.
516 else if (ch == 0x00A5)
519 bytes[posn++] = (byte)0x5C;
// NOTE(review): this call passes the loop index i and byteIndex by reference,
// while the later HandleFallback call in this method passes charIndex and
// posn. The loop advances i and every write goes through posn, so the two
// calls look inconsistent -- confirm which arguments are intended.
523 HandleFallback (ref buffer, chars, ref i, ref charCount, bytes,
524 ref byteIndex, ref byteCount, null);
528 else if (ch >= 0x0391 && ch <= 0x0451)
530 // Greek subset characters.
// Each table entry is two bytes wide and stored low byte first.
531 value = (ch - 0x0391) * 2;
532 value = ((int)(greekToJis[value])) |
533 (((int)(greekToJis[value + 1])) << 8);
535 else if (ch >= 0x2010 && ch <= 0x9FA5)
537 // This range contains the bulk of the CJK set.
538 value = (ch - 0x2010) * 2;
539 value = ((int)(cjkToJis[value])) |
540 (((int)(cjkToJis[value + 1])) << 8);
// 0xE000-0xE757 lies in the Unicode Private Use Area; the value is derived
// arithmetically from the offset into that range rather than from a table.
542 else if (ch >= 0xE000 && ch <= 0xE757)
545 int diff = ch - 0xE000;
546 value = ((int)(diff / 0xBC) << 8)
549 if (value % 0x100 >= 0x7F)
552 else if (ch >= 0xFF01 && ch <= 0xFF60)
554 value = (ch - 0xFF01) * 2;
555 value = ((int)(extraToJis[value])) |
556 (((int)(extraToJis[value + 1])) << 8);
// NOTE(review): 0xFF60 also satisfies the preceding test, so it never reaches
// this branch. The upper bound 0xFFA0 yields 0xE0, which CP932Decoder treats
// as the first byte of a double-byte sequence -- confirm the intended range.
558 else if (ch >= 0xFF60 && ch <= 0xFFA0)
// Single-byte result: 0xFF61 becomes 0xA1, 0xFF62 becomes 0xA2, and so on.
560 value = ch - 0xFF60 + 0xA0;
564 // Invalid character.
// NOTE(review): passes charIndex rather than the loop index i -- see the
// note on the earlier HandleFallback call in this method.
569 HandleFallback (ref buffer, chars, ref charIndex, ref charCount,
570 bytes, ref posn, ref byteCount, null);
// Values below 0x0100 are written as a single byte.
572 else if (value < 0x0100)
574 bytes[posn++] = (byte)value;
// All remaining cases write two bytes, so two slots must be free.
576 else if ((posn + 1) >= byteLength)
578 throw new ArgumentException
579 (Strings.GetString("Arg_InsufficientSpace"),
582 else if (value < 0x8000)
584 // JIS X 0208 character.
// Trail byte: position within a 0xBC-wide row, offset by 0x40.
587 value = (value % 0xBC) + 0x40;
// NOTE(review): ch appears to be reused here as a row index -- confirm.
// Rows below 0x1F get lead byte ch + 0x81; the rest continue from 0xE0.
592 if (ch < (0x9F - 0x80))
594 bytes[posn++] = (byte)(ch + 0x81);
598 bytes[posn++] = (byte)(ch - (0x9F - 0x80) + 0xE0);
600 bytes[posn++] = (byte)value;
// Values in 0xF040..0xF9FC are written unchanged, high byte first.
602 else if (value >= 0xF040 && value <= 0xF9FC)
605 bytes[posn++] = (byte)(value / 0x100);
606 bytes[posn++] = (byte)(value % 0x100);
610 // JIS X 0212 character, which Shift-JIS doesn't
611 // support, but we've already allocated two slots.
612 bytes[posn++] = (byte)'?';
613 bytes[posn++] = (byte)'?';
617 // Return the final length to the caller.
618 return posn - byteIndex;
// Get the number of characters produced by decoding a byte range.
// A new CP932Decoder is created for each call and invoked with its
// refresh argument set to true.
622 public override int GetCharCount (byte [] bytes, int index, int count)
624 return new CP932Decoder (JISConvert.Convert).GetCharCount (
625 bytes, index, count, true);
// Decode a byte range into the chars array, starting at charIndex.
// A new CP932Decoder is created for each call and its GetChars result is returned.
628 public override int GetChars (
629 byte [] bytes, int byteIndex, int byteCount,
630 char [] chars, int charIndex)
632 return new CP932Decoder (JISConvert.Convert).GetChars (bytes,
633 byteIndex, byteCount, chars, charIndex,
637 // Get the maximum number of bytes needed to encode a
638 // specified number of characters.
// Throws ArgumentOutOfRangeException with the "ArgRange_NonNegative" message
// (presumably when charCount is negative -- confirm against the guard).
639 public override int GetMaxByteCount(int charCount)
643 throw new ArgumentOutOfRangeException
645 Strings.GetString("ArgRange_NonNegative"));
// The upper bound is two bytes for every character.
647 return charCount * 2;
650 // Get the maximum number of characters needed to decode a
651 // specified number of bytes.
// Throws ArgumentOutOfRangeException with the "ArgRange_NonNegative" message
// (presumably when byteCount is negative -- confirm against the guard).
652 public override int GetMaxCharCount(int byteCount)
656 throw new ArgumentOutOfRangeException
658 Strings.GetString("ArgRange_NonNegative"));
663 // Get a decoder that handles a rolling Shift-JIS state.
// Each call returns a new CP932Decoder built on JISConvert.Convert.
664 public override Decoder GetDecoder()
666 return new CP932Decoder(JISConvert.Convert);
671 // Get the mail body name for this encoding.
// NOTE(review): this returns "iso-2022-jp" while WebName returns "shift_jis";
// presumably this mirrors what Windows reports for code page 932 -- confirm.
672 public override String BodyName {
673 get { return "iso-2022-jp"; }
676 // Get the human-readable name for this encoding.
677 public override String EncodingName {
678 get { return "Japanese (Shift-JIS)"; }
681 // Get the mail agent header name for this encoding.
// Returns the same string as BodyName.
682 public override String HeaderName {
683 get { return "iso-2022-jp"; }
686 // Determine if this encoding can be displayed in a Web browser.
687 public override bool IsBrowserDisplay {
691 // Determine if this encoding can be saved from a Web browser.
692 public override bool IsBrowserSave {
696 // Determine if this encoding can be displayed in a mail/news agent.
697 public override bool IsMailNewsDisplay {
701 // Determine if this encoding can be saved from a mail/news agent.
702 public override bool IsMailNewsSave {
706 // Get the IANA-preferred Web name for this encoding.
707 public override String WebName {
708 get { return "shift_jis"; }
711 // Get the Windows code page represented by this object.
// Returns the same constant that the constructor passes to the base class.
712 public override int WindowsCodePage {
713 get { return SHIFTJIS_CODE_PAGE; }
716 // FIXME: This doesn't make sense, but without declaring this override
717 // System.XML regresses at Encoder.Convert() in
718 // MonoTests.System.Xml.XmlWriterSettingsTests.EncodingTest.
// Returns a new MonoEncodingDefaultEncoder wrapping this encoding instance.
719 public override Encoder GetEncoder ()
721 return new MonoEncodingDefaultEncoder (this);
726 #endif // !ECMA_COMPAT
728 // Decoder that handles a rolling Shift-JIS state.
// The state is a lead byte left pending by an earlier call. It is kept in
// separate fields for the counting path and the decoding path.
729 sealed class CP932Decoder : DbcsEncoding.DbcsDecoder
// Conversion tables; the "new" modifier hides an inherited member of the same name.
731 private new JISConvert convert;
// Pending lead byte carried between GetCharCount calls.
732 private int last_byte_count;
// Pending lead byte carried between GetChars calls.
733 private int last_byte_chars;
// Construct a decoder over the given conversion tables.
736 public CP932Decoder(JISConvert convert)
739 this.convert = convert;
742 // Override inherited methods.
// Counting entry point: forwards to the four-argument overload with refresh = false.
744 public override int GetCharCount (
745 byte [] bytes, int index, int count)
747 return GetCharCount (bytes, index, count, false);
// Count the characters a byte range decodes to.
// NOTE(review): refresh appears to select whether the pending lead byte is
// cleared or saved at the end of the call -- confirm.
752 int GetCharCount (byte [] bytes, int index, int count, bool refresh)
754 CheckRange (bytes, index, count);
756 // Determine the total length of the converted string.
// Start from whatever lead byte the previous call left pending.
759 int last = last_byte_count;
762 byteval = bytes[index++];
766 if((byteval >= 0x81 && byteval <= 0x9F) ||
767 (byteval >= 0xE0 && byteval <= 0xEF))
769 // First byte in a double-byte sequence.
776 // Second byte in a double-byte sequence.
// '\0' converts to 0, i.e. no lead byte is pending.
783 last_byte_count = '\0';
// Save the pending lead byte for the next call.
786 last_byte_count = last;
788 // Return the total length.
// Decoding entry point: forwards to the six-argument overload with refresh = false.
792 public override int GetChars (
793 byte [] bytes, int byteIndex, int byteCount,
794 char [] chars, int charIndex)
796 return GetChars (bytes, byteIndex, byteCount,
797 chars, charIndex, false);
803 byte [] bytes, int byteIndex, int byteCount,
804 char [] chars, int charIndex, bool refresh)
806 CheckRange (bytes, byteIndex, byteCount,
809 // Decode the bytes in the buffer.
// posn is the current write position; the capacity is the full chars array length.
810 int posn = charIndex;
811 int charLength = chars.Length;
// Start from whatever lead byte the previous call left pending.
813 int last = last_byte_chars;
// The table is declared twice below, once as a raw pointer and once as an
// array. NOTE(review): presumably only one is compiled in, selected by a
// preprocessor conditional -- confirm.
815 byte *table = convert.jisx0208ToUnicode;
817 byte[] table = convert.jisx0208ToUnicode;
821 byteval = bytes[byteIndex++];
825 if(posn >= charLength)
827 throw new ArgumentException
829 ("Arg_InsufficientSpace"), "chars");
831 if((byteval >= 0x81 && byteval <= 0x9F) ||
832 (byteval >= 0xE0 && byteval <= 0xEF))
834 // First byte in a double-byte sequence.
837 else if(byteval < 0x80)
839 // Ordinary ASCII/Latin1 character.
840 chars[posn++] = (char)byteval;
842 else if(byteval >= 0xA1 && byteval <= 0xDF)
844 // Half-width katakana character.
// Byte 0xA1 maps to U+FF61, 0xA2 to U+FF62, and so on.
845 chars[posn++] = (char)(byteval - 0xA1 + 0xFF61);
849 // Invalid first byte.
855 // Second byte in a double-byte sequence.
// Lead bytes 0x81-0x9F select the first rows; each row spans 0xBC cells.
856 if(last >= 0x81 && last <= 0x9F)
858 value = (last - 0x81) * 0xBC;
// NOTE(review): the lead-byte test above only accepts 0x81-0x9F and
// 0xE0-0xEF, so it is unclear how last can lie in 0xF0-0xFC here -- confirm
// that this branch is reachable.
860 else if (last >= 0xF0 && last <= 0xFC && byteval <= 0xFC)
863 value = 0xE000 + (last - 0xF0) * 0xBC + byteval;
// Remaining lead bytes continue the row numbering after the 0x81-0x9F block.
869 value = (last - 0xE0 + (0xA0 - 0x81)) * 0xBC;
// Trail bytes 0x40-0x7E and 0x80-0xFC form one contiguous cell index; 0x7F is skipped.
872 if(byteval >= 0x40 && byteval <= 0x7E)
874 value += (byteval - 0x40);
876 else if(byteval >= 0x80 && byteval <= 0xFC)
878 value += (byteval - 0x80 + 0x3F);
882 // Invalid second byte.
// Each table entry is two bytes wide and stored low byte first.
887 value = ((int)(table[value])) |
888 (((int)(table[value + 1])) << 8);
891 chars[posn++] = (char)value;
// Emit U+30FB and clear the pending lead byte.
// NOTE(review): presumably taken when refresh is set and a lead byte is
// still pending at the end of the input -- confirm.
901 chars[posn++] = '\u30FB';
902 last_byte_chars = '\0';
// Save the pending lead byte for the next call.
905 last_byte_chars = last;
907 // Return the final length to the caller.
908 return posn - charIndex;
911 } // class CP932Decoder
// Subclass of CP932 that adds nothing but a public default constructor.
// NOTE(review): presumably exists so the encoding can be located by a type
// name derived from "shift_jis" -- confirm against the encoding loader.
914 public class ENCshift_jis : CP932
916 public ENCshift_jis() : base() {}
918 }; // class ENCshift_jis
920 }; // namespace I18N.CJK