2 * CP932.cs - Japanese (Shift-JIS) code page.
4 * Copyright (c) 2002 Southern Storm Software, Pty Ltd
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the "Software"),
8 * to deal in the Software without restriction, including without limitation
9 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 * and/or sell copies of the Software, and to permit persons to whom the
11 * Software is furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included
14 * in all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
17 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
20 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22 * OTHER DEALINGS IN THE SOFTWARE.
26 // Copyright (C) 2005-2006 Novell, Inc.
37 using MonoEncoder = I18N.Common.MonoSafeEncoder;
38 using MonoEncoding = I18N.Common.MonoSafeEncoding;
// Shift-JIS (code page 932) encoding. Derives from MonoEncoding, which the
// using-alias above binds to I18N.Common.MonoSafeEncoding.
// NOTE(review): this extract is line-sampled; braces and some members are on
// lines not visible here.
42 public class CP932 : MonoEncoding
44 // Magic number used by Windows for the Shift-JIS code page.
45 private const int SHIFTJIS_CODE_PAGE = 932;
// Default constructor: hands the code page number (932) to the base class.
48 public CP932() : base(SHIFTJIS_CODE_PAGE)
53 // Get the number of bytes needed to encode a character buffer.
// Pointer-based overload: takes `count` chars starting at `chars` and has no
// output buffer parameter, so it only measures. The visible branches classify
// each char by code-point range; two of them consult the JISConvert tables.
// NOTE(review): the variables ch and value, the loop, and the length
// accumulator are on lines not visible in this extract.
54 public unsafe override int GetByteCountImpl (char* chars, int count)
58 // Determine the length of the final output.
// The tables are declared twice (pointer form, then array form); presumably a
// conditional-compilation directive not visible here selects one — confirm.
62 byte *cjkToJis = JISConvert.Convert.cjkToJis;
63 byte *extraToJis = JISConvert.Convert.extraToJis;
65 byte[] cjkToJis = JISConvert.Convert.cjkToJis;
66 byte[] extraToJis = JISConvert.Convert.extraToJis;
75 // Character maps to itself.
80 // Check for special Latin 1 characters that
81 // can be mapped to double-byte code points.
82 if(ch == 0x00A2 || ch == 0x00A3 || ch == 0x00A7 ||
83 ch == 0x00A8 || ch == 0x00AC || ch == 0x00B0 ||
84 ch == 0x00B1 || ch == 0x00B4 || ch == 0x00B6 ||
85 ch == 0x00D7 || ch == 0x00F7)
90 else if(ch >= 0x0391 && ch <= 0x0451)
92 // Greek subset characters.
95 else if(ch >= 0x2010 && ch <= 0x9FA5)
97 // This range contains the bulk of the CJK set.
// Table entries are two bytes each, low byte first: the index is the offset
// from the start of the range times two, and the pair is recombined into a
// 16-bit value.
98 value = (ch - 0x2010) * 2;
99 value = ((int)(cjkToJis[value])) |
100 (((int)(cjkToJis[value + 1])) << 8);
// 0xE000..0xE757 gets its own branch; its body is not visible here.
106 else if(ch >= 0xE000 && ch <= 0xE757)
// NOTE(review): this branch sends everything up to 0xFFEF through extraToJis,
// whereas GetBytesImpl splits the same area at 0xFF60 / 0xFFA0 and treats
// anything above 0xFFA0 differently — confirm the byte counts still agree.
109 else if(ch >= 0xFF01 && ch <= 0xFFEF)
111 // This range contains extra characters,
112 // including half-width katakana.
113 value = (ch - 0xFF01) * 2;
114 value = ((int)(extraToJis[value])) |
115 (((int)(extraToJis[value + 1])) << 8);
123 // Return the length to the caller.
127 // Get the bytes that result from encoding a character buffer.
// Pointer-based overload: encodes chars read from `chars` into `bytes` and
// returns the number of bytes produced (posn - byteIndex at the end).
// Throws ArgumentException ("Arg_InsufficientSpace") when the output buffer
// cannot hold the next one or two bytes.
// NOTE(review): charIndex, byteIndex, end, ch and value are used below but
// are declared on lines not visible in this extract.
128 public unsafe override int GetBytesImpl (
129 char* chars, int charCount, byte* bytes, int byteCount)
// Fallback buffer starts out null and is passed by ref to HandleFallback.
134 EncoderFallbackBuffer buffer = null;
137 // Convert the characters into their byte form.
138 int posn = byteIndex;
140 int byteLength = byteCount;
// The tables are declared twice (pointer form, then array form); presumably a
// conditional-compilation directive not visible here selects one — confirm.
143 byte *cjkToJis = JISConvert.Convert.cjkToJis;
144 byte *greekToJis = JISConvert.Convert.greekToJis;
145 byte *extraToJis = JISConvert.Convert.extraToJis;
147 byte[] cjkToJis = JISConvert.Convert.cjkToJis;
148 byte[] greekToJis = JISConvert.Convert.greekToJis;
149 byte[] extraToJis = JISConvert.Convert.extraToJis;
// charCount is decremented together with i, so it tracks the chars remaining.
151 for (int i = charIndex; i < end; i++, charCount--)
// Every input char produces at least one byte, so one free slot is required
// before looking at it.
154 if(posn >= byteLength)
156 throw new ArgumentException
157 (Strings.GetString("Arg_InsufficientSpace"),
162 // Character maps to itself.
163 bytes[posn++] = (byte)ch;
168 // Check for special Latin 1 characters that
169 // can be mapped to double-byte code points.
170 if(ch == 0x00A2 || ch == 0x00A3 || ch == 0x00A7 ||
171 ch == 0x00A8 || ch == 0x00AC || ch == 0x00B0 ||
172 ch == 0x00B1 || ch == 0x00B4 || ch == 0x00B6 ||
173 ch == 0x00D7 || ch == 0x00F7)
// These characters are written as two bytes, so a second free slot is needed.
175 if((posn + 1) >= byteLength)
177 throw new ArgumentException
179 ("Arg_InsufficientSpace"), "bytes");
// Eleven byte pairs follow, each a 0x81 lead byte plus a trail byte. The
// labels that tie each pair to one of the characters tested above are not
// visible in this extract.
184 bytes[posn++] = (byte)0x81;
185 bytes[posn++] = (byte)0x91;
189 bytes[posn++] = (byte)0x81;
190 bytes[posn++] = (byte)0x92;
194 bytes[posn++] = (byte)0x81;
195 bytes[posn++] = (byte)0x98;
199 bytes[posn++] = (byte)0x81;
200 bytes[posn++] = (byte)0x4E;
204 bytes[posn++] = (byte)0x81;
205 bytes[posn++] = (byte)0xCA;
209 bytes[posn++] = (byte)0x81;
210 bytes[posn++] = (byte)0x8B;
214 bytes[posn++] = (byte)0x81;
215 bytes[posn++] = (byte)0x7D;
219 bytes[posn++] = (byte)0x81;
220 bytes[posn++] = (byte)0x4C;
224 bytes[posn++] = (byte)0x81;
225 bytes[posn++] = (byte)0xF7;
229 bytes[posn++] = (byte)0x81;
230 bytes[posn++] = (byte)0x7E;
234 bytes[posn++] = (byte)0x81;
235 bytes[posn++] = (byte)0x80;
// U+00A5 is written as the single byte 0x5C.
239 else if(ch == 0x00A5)
242 bytes[posn++] = (byte)0x5C;
// Two treatments of an unmappable Latin-1 char appear back to back:
// HandleFallback and a literal '?'. Presumably alternatives under a
// conditional-compilation directive not visible here — confirm.
247 HandleFallback (ref buffer,
248 chars, ref charIndex, ref charCount,
249 bytes, ref posn, ref byteCount, null);
251 // Invalid character.
252 bytes[posn++] = (byte)'?';
257 else if(ch >= 0x0391 && ch <= 0x0451)
259 // Greek subset characters.
// Table entries are two bytes each, low byte first.
260 value = (ch - 0x0391) * 2;
261 value = ((int)(greekToJis[value])) |
262 (((int)(greekToJis[value + 1])) << 8);
264 else if(ch >= 0x2010 && ch <= 0x9FA5)
266 // This range contains the bulk of the CJK set.
267 value = (ch - 0x2010) * 2;
268 value = ((int)(cjkToJis[value])) |
269 (((int)(cjkToJis[value + 1])) << 8);
// 0xE000..0xE757 is computed arithmetically from the offset into the range
// rather than looked up. The rest of the expression and the adjustment made
// when the low byte is >= 0x7F are on lines not visible here.
271 else if(ch >= 0xE000 && ch <= 0xE757)
274 int diff = ch - 0xE000;
275 value = ((int) (diff / 0xBC) << 8)
278 if (value % 0x100 >= 0x7F)
// NOTE(review): 0xFF60 satisfies both this test and the next one. Because
// this test comes first in the else-if chain, the next branch effectively
// handles 0xFF61..0xFFA0 only — confirm that is intended.
281 else if(ch >= 0xFF01 && ch <= 0xFF60)
283 value = (ch - 0xFF01) * 2;
284 value = ((int)(extraToJis[value])) |
285 (((int)(extraToJis[value + 1])) << 8);
287 else if(ch >= 0xFF60 && ch <= 0xFFA0)
// Direct arithmetic mapping: 0xFF61 becomes 0xA1. Results stay below 0x100,
// so the `value < 0x0100` branch further down writes them as one byte.
289 value = ch - 0xFF60 + 0xA0;
293 // Invalid character.
// As above, HandleFallback and a literal '?' both appear; presumably
// alternatives selected by a directive not visible here.
299 HandleFallback (ref buffer,
300 chars, ref charIndex, ref charCount,
301 bytes, ref posn, ref byteCount, null);
303 bytes[posn++] = (byte)'?';
// Single-byte result.
306 else if(value < 0x0100)
308 bytes[posn++] = (byte)value;
// Everything below writes two bytes, so two free slots are required.
310 else if((posn + 1) >= byteLength)
312 throw new ArgumentException
313 (Strings.GetString("Arg_InsufficientSpace"),
316 else if(value < 0x8000)
318 // JIS X 0208 character.
// NOTE(review): `ch` is compared against small constants below, so it must
// have been reassigned on a line not visible here (presumably from
// value / 0xBC) — confirm. `value` itself is reduced to the trail byte.
321 value = (value % 0xBC) + 0x40;
// Lead byte: the lower group is offset from 0x81, the upper group from 0xE0.
326 if(ch < (0x9F - 0x80))
328 bytes[posn++] = (byte)(ch + 0x81);
332 bytes[posn++] = (byte)(ch - (0x9F - 0x80) + 0xE0);
334 bytes[posn++] = (byte)value;
// Values in 0xF040..0xF9FC are written as-is, high byte first.
336 else if (value >= 0xF040 && value <= 0xF9FC)
339 bytes[posn++] = (byte) (value / 0x100);
340 bytes[posn++] = (byte) (value % 0x100);
344 // JIS X 0212 character, which Shift-JIS doesn't
345 // support, but we've already allocated two slots.
346 bytes[posn++] = (byte)'?';
347 bytes[posn++] = (byte)'?';
351 // Return the final length to the caller.
352 return posn - byteIndex;
355 // Get the number of bytes needed to encode a character buffer.
// Array-based overload taking (chars, index, count). The visible range tests
// and table lookups match those in GetByteCountImpl.
// NOTE(review): ch, value, the loop and the length accumulator are on lines
// not visible in this extract.
356 public override int GetByteCount(char[] chars, int index, int count)
358 // Determine the length of the final output.
361 byte[] cjkToJis = JISConvert.Convert.cjkToJis;
362 byte[] extraToJis = JISConvert.Convert.extraToJis;
371 // Character maps to itself.
374 else if (ch < 0x0100)
376 // Check for special Latin 1 characters that
377 // can be mapped to double-byte code points.
378 if (ch == 0x00A2 || ch == 0x00A3 || ch == 0x00A7 ||
379 ch == 0x00A8 || ch == 0x00AC || ch == 0x00B0 ||
380 ch == 0x00B1 || ch == 0x00B4 || ch == 0x00B6 ||
381 ch == 0x00D7 || ch == 0x00F7)
386 else if (ch >= 0x0391 && ch <= 0x0451)
388 // Greek subset characters.
391 else if (ch >= 0x2010 && ch <= 0x9FA5)
393 // This range contains the bulk of the CJK set.
// Table entries are two bytes each, low byte first.
394 value = (ch - 0x2010) * 2;
395 value = ((int)(cjkToJis[value])) |
396 (((int)(cjkToJis[value + 1])) << 8);
// 0xE000..0xE757 gets its own branch; its body is not visible here.
402 else if (ch >= 0xE000 && ch <= 0xE757)
// NOTE(review): this range runs to 0xFFEF, while GetBytes splits the same
// area at 0xFF60 / 0xFFA0 — confirm the byte counts still agree.
405 else if (ch >= 0xFF01 && ch <= 0xFFEF)
407 // This range contains extra characters,
408 // including half-width katakana.
409 value = (ch - 0xFF01) * 2;
410 value = ((int)(extraToJis[value])) |
411 (((int)(extraToJis[value + 1])) << 8);
419 // Return the length to the caller.
423 // Get the bytes that result from encoding a character buffer.
// Array-based overload: encodes charCount chars starting at chars[charIndex]
// into bytes starting at bytes[byteIndex], and returns the number of bytes
// produced (posn - byteIndex at the end). Throws ArgumentException
// ("Arg_InsufficientSpace") when the output array is too small.
// NOTE(review): ch and value are declared on lines not visible here.
424 public override int GetBytes(char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex)
// byteCount is the whole array length, so the bounds checks below compare
// posn against the end of the array.
426 int byteCount = bytes.Length;
428 EncoderFallbackBuffer buffer = null;
431 // Convert the characters into their byte form.
432 int posn = byteIndex;
433 int end = charIndex + charCount;
434 int byteLength = byteCount;
436 byte[] cjkToJis = JISConvert.Convert.cjkToJis;
437 byte[] greekToJis = JISConvert.Convert.greekToJis;
438 byte[] extraToJis = JISConvert.Convert.extraToJis;
// charCount is decremented together with i, so it tracks the chars remaining.
440 for (int i = charIndex; i < end; i++, charCount--)
// Every input char produces at least one byte.
444 if (posn >= byteLength)
446 throw new ArgumentException
447 (Strings.GetString("Arg_InsufficientSpace"),
452 // Character maps to itself.
453 bytes[posn++] = (byte)ch;
456 else if (ch < 0x0100)
458 // Check for special Latin 1 characters that
459 // can be mapped to double-byte code points.
460 if (ch == 0x00A2 || ch == 0x00A3 || ch == 0x00A7 ||
461 ch == 0x00A8 || ch == 0x00AC || ch == 0x00B0 ||
462 ch == 0x00B1 || ch == 0x00B4 || ch == 0x00B6 ||
463 ch == 0x00D7 || ch == 0x00F7)
// These characters are written as two bytes, so a second free slot is needed.
465 if ((posn + 1) >= byteLength)
467 throw new ArgumentException
469 ("Arg_InsufficientSpace"), "bytes");
// Eleven byte pairs follow, each a 0x81 lead byte plus a trail byte. The
// labels that tie each pair to one of the characters tested above are not
// visible in this extract.
474 bytes[posn++] = (byte)0x81;
475 bytes[posn++] = (byte)0x91;
479 bytes[posn++] = (byte)0x81;
480 bytes[posn++] = (byte)0x92;
484 bytes[posn++] = (byte)0x81;
485 bytes[posn++] = (byte)0x98;
489 bytes[posn++] = (byte)0x81;
490 bytes[posn++] = (byte)0x4E;
494 bytes[posn++] = (byte)0x81;
495 bytes[posn++] = (byte)0xCA;
499 bytes[posn++] = (byte)0x81;
500 bytes[posn++] = (byte)0x8B;
504 bytes[posn++] = (byte)0x81;
505 bytes[posn++] = (byte)0x7D;
509 bytes[posn++] = (byte)0x81;
510 bytes[posn++] = (byte)0x4C;
514 bytes[posn++] = (byte)0x81;
515 bytes[posn++] = (byte)0xF7;
519 bytes[posn++] = (byte)0x81;
520 bytes[posn++] = (byte)0x7E;
524 bytes[posn++] = (byte)0x81;
525 bytes[posn++] = (byte)0x80;
// U+00A5 is written as the single byte 0x5C.
529 else if (ch == 0x00A5)
532 bytes[posn++] = (byte)0x5C;
// NOTE(review): this call passes `ref i` and `ref byteIndex`, while the other
// HandleFallback call in this method passes `ref charIndex` and `ref posn`.
// All direct writes go through posn and the return value is posn - byteIndex,
// so if HandleFallback advances its byte-index argument, the output position
// and the returned length may be wrong on this path — confirm which pair of
// variables is intended.
537 HandleFallback (ref buffer, chars, ref i, ref charCount, bytes,
538 ref byteIndex, ref byteCount, null);
540 // Invalid character.
541 bytes[posn++] = (byte)'?';
546 else if (ch >= 0x0391 && ch <= 0x0451)
548 // Greek subset characters.
// Table entries are two bytes each, low byte first.
549 value = (ch - 0x0391) * 2;
550 value = ((int)(greekToJis[value])) |
551 (((int)(greekToJis[value + 1])) << 8);
553 else if (ch >= 0x2010 && ch <= 0x9FA5)
555 // This range contains the bulk of the CJK set.
556 value = (ch - 0x2010) * 2;
557 value = ((int)(cjkToJis[value])) |
558 (((int)(cjkToJis[value + 1])) << 8);
// 0xE000..0xE757 is computed arithmetically from the offset into the range.
// The rest of the expression and the adjustment made when the low byte is
// >= 0x7F are on lines not visible here.
560 else if (ch >= 0xE000 && ch <= 0xE757)
563 int diff = ch - 0xE000;
564 value = ((int)(diff / 0xBC) << 8)
567 if (value % 0x100 >= 0x7F)
// NOTE(review): 0xFF60 satisfies both this test and the next one. Because
// this test comes first, the next branch effectively handles
// 0xFF61..0xFFA0 only — confirm that is intended.
570 else if (ch >= 0xFF01 && ch <= 0xFF60)
572 value = (ch - 0xFF01) * 2;
573 value = ((int)(extraToJis[value])) |
574 (((int)(extraToJis[value + 1])) << 8);
576 else if (ch >= 0xFF60 && ch <= 0xFFA0)
// Direct arithmetic mapping: 0xFF61 becomes 0xA1. Results stay below 0x100,
// so the `value < 0x0100` branch further down writes them as one byte.
578 value = ch - 0xFF60 + 0xA0;
582 // Invalid character.
// This call uses `ref charIndex` / `ref posn`, unlike the earlier call in
// this method (see the review note there).
588 HandleFallback (ref buffer, chars, ref charIndex, ref charCount,
589 bytes, ref posn, ref byteCount, null);
591 bytes[posn++] = (byte)'?';
// Single-byte result.
594 else if (value < 0x0100)
596 bytes[posn++] = (byte)value;
// Everything below writes two bytes, so two free slots are required.
598 else if ((posn + 1) >= byteLength)
600 throw new ArgumentException
601 (Strings.GetString("Arg_InsufficientSpace"),
604 else if (value < 0x8000)
606 // JIS X 0208 character.
// NOTE(review): `ch` is compared against small constants below, so it must
// have been reassigned on a line not visible here (presumably from
// value / 0xBC) — confirm. `value` itself is reduced to the trail byte.
609 value = (value % 0xBC) + 0x40;
// Lead byte: the lower group is offset from 0x81, the upper group from 0xE0.
614 if (ch < (0x9F - 0x80))
616 bytes[posn++] = (byte)(ch + 0x81);
620 bytes[posn++] = (byte)(ch - (0x9F - 0x80) + 0xE0);
622 bytes[posn++] = (byte)value;
// Values in 0xF040..0xF9FC are written as-is, high byte first.
624 else if (value >= 0xF040 && value <= 0xF9FC)
627 bytes[posn++] = (byte)(value / 0x100);
628 bytes[posn++] = (byte)(value % 0x100);
632 // JIS X 0212 character, which Shift-JIS doesn't
633 // support, but we've already allocated two slots.
634 bytes[posn++] = (byte)'?';
635 bytes[posn++] = (byte)'?';
639 // Return the final length to the caller.
640 return posn - byteIndex;
// Counts the chars that decoding bytes[index .. index+count) would produce.
// Builds a fresh CP932Decoder for every call and delegates to its
// four-argument GetCharCount overload with the `refresh` flag set to true.
644 public override int GetCharCount (byte [] bytes, int index, int count)
646 return new CP932Decoder (JISConvert.Convert).GetCharCount (
647 bytes, index, count, true);
// Decodes bytes into chars by delegating to a freshly constructed
// CP932Decoder. The final argument of the delegated call is on a line not
// visible in this extract (presumably the `refresh` flag — confirm).
650 public override int GetChars (
651 byte [] bytes, int byteIndex, int byteCount,
652 char [] chars, int charIndex)
654 return new CP932Decoder (JISConvert.Convert).GetChars (bytes,
655 byteIndex, byteCount, chars, charIndex,
659 // Get the maximum number of bytes needed to encode a
660 // specified number of characters.
// Returns charCount * 2, i.e. two bytes per char as the worst case.
// The guard condition for the exception is not visible here; the message key
// ("ArgRange_NonNegative") suggests it rejects a negative charCount.
661 public override int GetMaxByteCount(int charCount)
665 throw new ArgumentOutOfRangeException
667 Strings.GetString("ArgRange_NonNegative"));
669 return charCount * 2;
672 // Get the maximum number of characters needed to decode a
673 // specified number of bytes.
// The guard condition and the return statement are not visible in this
// extract; the message key ("ArgRange_NonNegative") suggests a negative
// byteCount is rejected.
674 public override int GetMaxCharCount(int byteCount)
678 throw new ArgumentOutOfRangeException
680 Strings.GetString("ArgRange_NonNegative"));
685 // Get a decoder that handles a rolling Shift-JIS state.
// Each call returns a new CP932Decoder built on the shared JISConvert.Convert
// tables.
686 public override Decoder GetDecoder()
688 return new CP932Decoder(JISConvert.Convert);
693 // Get the mail body name for this encoding.
// NOTE(review): BodyName and HeaderName both return "iso-2022-jp" while
// WebName returns "shift_jis". Presumably this mirrors the values .NET
// reports for code page 932 — confirm against the framework documentation
// before changing either.
694 public override String BodyName {
695 get { return "iso-2022-jp"; }
698 // Get the human-readable name for this encoding.
699 public override String EncodingName {
700 get { return "Japanese (Shift-JIS)"; }
703 // Get the mail agent header name for this encoding.
704 public override String HeaderName {
705 get { return "iso-2022-jp"; }
// The getters of the four Is* properties below are on lines not visible in
// this extract.
708 // Determine if this encoding can be displayed in a Web browser.
709 public override bool IsBrowserDisplay {
713 // Determine if this encoding can be saved from a Web browser.
714 public override bool IsBrowserSave {
718 // Determine if this encoding can be displayed in a mail/news agent.
719 public override bool IsMailNewsDisplay {
723 // Determine if this encoding can be saved from a mail/news agent.
724 public override bool IsMailNewsSave {
728 // Get the IANA-preferred Web name for this encoding.
729 public override String WebName {
730 get { return "shift_jis"; }
733 // Get the Windows code page represented by this object.
734 public override int WindowsCodePage {
735 get { return SHIFTJIS_CODE_PAGE; }
740 #endif // !ECMA_COMPAT
742 // Decoder that handles a rolling Shift-JIS state.
// Sealed subclass of DbcsEncoding.DbcsDecoder. Keeps two separate carry-over
// fields, one used by the counting path and one by the decoding path.
743 sealed class CP932Decoder : DbcsEncoding.DbcsDecoder
// `new` hides an inherited member with the same name.
745 private new JISConvert convert;
// State carried between GetCharCount calls (read into `last` there).
746 private int last_byte_count;
// State carried between GetChars calls (read into `last` there).
747 private int last_byte_chars;
// Stores the conversion tables. The base-constructor call, if any, is on a
// line not visible in this extract.
750 public CP932Decoder(JISConvert convert)
753 this.convert = convert;
756 // Override inherited methods.
// Three-argument overload: forwards to the four-argument overload with
// refresh = false.
758 public override int GetCharCount (
759 byte [] bytes, int index, int count)
761 return GetCharCount (bytes, index, count, false);
// Counts the chars that the given byte range decodes to, starting from the
// state left in last_byte_count by a previous call.
// NOTE(review): the loop, the length counter and the conditions around the
// two last_byte_count assignments are on lines not visible here. Presumably
// `refresh` decides whether the carried state is reset or saved — confirm.
768 int GetCharCount (byte [] bytes, int index, int count, bool refresh)
770 CheckRange (bytes, index, count);
772 // Determine the total length of the converted string.
775 int last = last_byte_count;
778 byteval = bytes[index++];
// Lead bytes of a two-byte sequence: 0x81..0x9F and 0xE0..0xEF.
782 if((byteval >= 0x81 && byteval <= 0x9F) ||
783 (byteval >= 0xE0 && byteval <= 0xEF))
785 // First byte in a double-byte sequence.
792 // Second byte in a double-byte sequence.
// The field is an int; the char literal '\0' stores 0, i.e. clears the state.
799 last_byte_count = '\0';
// Saves the local state back into the field for the next call.
802 last_byte_count = last;
804 // Return the total length.
// Five-argument overload: forwards to the six-argument overload with
// refresh = false.
808 public override int GetChars (
809 byte [] bytes, int byteIndex, int byteCount,
810 char [] chars, int charIndex)
812 return GetChars (bytes, byteIndex, byteCount,
813 chars, charIndex, false);
// Six-argument GetChars overload (its first signature line is not visible in
// this extract). Decodes bytes into chars starting at chars[charIndex],
// carrying state in last_byte_chars between calls, and returns the number of
// chars written (posn - charIndex). Throws ArgumentException
// ("Arg_InsufficientSpace") when chars has no room for the next char.
821 byte [] bytes, int byteIndex, int byteCount,
822 char [] chars, int charIndex, bool refresh)
824 CheckRange (bytes, byteIndex, byteCount,
827 // Decode the bytes in the buffer.
828 int posn = charIndex;
829 int charLength = chars.Length;
831 int last = last_byte_chars;
// The table is declared twice (pointer form, then array form); presumably a
// conditional-compilation directive not visible here selects one — confirm.
833 byte *table = convert.jisx0208ToUnicode;
835 byte[] table = convert.jisx0208ToUnicode;
839 byteval = bytes[byteIndex++];
843 if(posn >= charLength)
845 throw new ArgumentException
847 ("Arg_InsufficientSpace"), "chars");
// Lead bytes of a two-byte sequence: 0x81..0x9F and 0xE0..0xEF.
849 if((byteval >= 0x81 && byteval <= 0x9F) ||
850 (byteval >= 0xE0 && byteval <= 0xEF))
852 // First byte in a double-byte sequence.
855 else if(byteval < 0x80)
857 // Ordinary ASCII/Latin1 character.
858 chars[posn++] = (char)byteval;
860 else if(byteval >= 0xA1 && byteval <= 0xDF)
862 // Half-width katakana character.
// 0xA1 maps to U+FF61, and the rest of the range follows linearly.
863 chars[posn++] = (char)(byteval - 0xA1 + 0xFF61);
867 // Invalid first byte.
873 // Second byte in a double-byte sequence.
// `value` is built up as an offset: the lead byte contributes a multiple of
// 0xBC, and the trail byte adds a position within that span.
874 if(last >= 0x81 && last <= 0x9F)
876 value = (last - 0x81) * 0xBC;
// NOTE(review): the visible lead-byte test above only accepts 0x81..0x9F and
// 0xE0..0xEF, so it is unclear how `last` can hold 0xF0..0xFC here. Unless
// another assignment exists on a line not visible, this branch looks
// unreachable — confirm.
878 else if (last >= 0xF0 && last <= 0xFC && byteval <= 0xFC)
881 value = 0xE000 + (last - 0xF0) * 0xBC + byteval;
887 value = (last - 0xE0 + (0xA0 - 0x81)) * 0xBC;
// Trail bytes 0x40..0x7E and 0x80..0xFC are accepted; 0x7F is skipped, so the
// second range is shifted down by one (0x80 maps to offset 0x3F).
890 if(byteval >= 0x40 && byteval <= 0x7E)
892 value += (byteval - 0x40);
894 else if(byteval >= 0x80 && byteval <= 0xFC)
896 value += (byteval - 0x80 + 0x3F);
900 // Invalid second byte.
// Reads a 16-bit entry from the table, low byte first. Any scaling of `value`
// to a byte offset happens on a line not visible here.
905 value = ((int)(table[value])) |
906 (((int)(table[value + 1])) << 8);
909 chars[posn++] = (char)value;
// U+30FB is written and the carried state cleared; the condition guarding
// this is not visible (presumably a lead byte left pending when `refresh` is
// true — confirm).
919 chars[posn++] = '\u30FB';
920 last_byte_chars = '\0';
// Saves the local state back into the field for the next call.
923 last_byte_chars = last;
925 // Return the final length to the caller.
926 return posn - charIndex;
929 } // class CP932Decoder
// Subclass of CP932 that adds no members of its own; its constructor only
// calls the base constructor. Presumably it exists so the encoding can be
// located by a name-derived type ("shift_jis") — confirm against the loader.
932 public class ENCshift_jis : CP932
934 public ENCshift_jis() : base() {}
936 }; // class ENCshift_jis
938 }; // namespace I18N.CJK