3 // Copyright (c) Microsoft Corporation. All rights reserved.
9 // Ported to managed code from c_is2022.c and related iso 2022 dll files from mlang
13 // Managed implementation of ISO 2022 code pages, ported from the implementation in c_is2022.dll
14 // This code should be kept in sync with the other implementations
15 // This encoding wraps the basic encodings in code that adds the shift in/out wrapper methods
19 // IsAlwaysNormalized ???
20 // Regarding Normalization for ISO-2022-JP (50220, 50221, 50222), it's the same rules as EUCJP
21 // Forms KC & KD are precluded because of things like halfwidth Katakana that has compatibility mappings
22 // Form D is precluded because of 0x00a8, which changes to space + diaeresis.
24 // Note: I think that IsAlwaysNormalized should probably return true for form C for Japanese 20932 based CPs.
27 // Never normalized, C & D (& therefore KC & KD) are precluded because of Hangul syllables and combined characters.
29 // IsAlwaysNormalized ???
30 // Regarding Normalization for ISO-2022-CN (50227, 50229) & HZ-GB2312 (52936) I think is similar to the Japanese case.
31 // Forms KC & KD are precluded because of things like halfwidth Katakana that has compatibility mappings
32 // Form D is precluded because of 0x00a8, which changes to space + diaeresis.
34 // Note: I think that IsAlwaysNormalized should probably return true for form C for Chinese 20936 based CPs.
36 #if FEATURE_CODEPAGES_FILE // requires BaseCodePageEncooding
39 using System.Globalization;
40 using System.Diagnostics.Contracts;
42 using System.Runtime.InteropServices;
44 using System.Security;
45 using System.Runtime.CompilerServices;
46 using System.Runtime.Serialization;
49 /*=================================ISO2022Encoding============================
51 ** This is used to support ISO 2022 encodings that use shift/escape sequences.
53 ==============================================================================*/
56 internal class ISO2022Encoding : DBCSCodePageEncoding
// Control bytes used by the shift / escape sequences that the encoders and decoders below emit and recognize.
58 const byte SHIFT_OUT = (byte)0x0E;
59 const byte SHIFT_IN = (byte)0x0F;
60 const byte ESCAPE = 0x1B;
// Lead byte value (0x10) used to tag halfwidth Katakana in this encoding's byte values.
61 const byte LEADBYTE_HALFWIDTH = 0x10;
// Creates the encoding for the given ISO 2022 code page.
// We have to load another code page's tables (932, 949 or 936, see tableBaseCodePages), so impersonate it as our base.
// The base code page is looked up in tableBaseCodePages using the last decimal digit of codePage.
64 // This pretends to be other code pages as far as memory sections are concerned.
65 [System.Security.SecurityCritical] // auto-generated
66 internal ISO2022Encoding(int codePage) : base(codePage, tableBaseCodePages[codePage % 10])
68 this.m_bUseMlangTypeForSerialization = true;
71 // Constructor called by serialization.
72 // Note: We use the base GetObjectData however
// This constructor is not expected to run: it asserts and then throws an ArgumentException.
73 [System.Security.SecurityCritical] // auto-generated
74 internal ISO2022Encoding(SerializationInfo info, StreamingContext context) : base(info, context)
76 // Actually this can't ever get called, CodePageEncoding is our proxy
77 Contract.Assert(false, "Didn't expect to make it to DBCSCodePageEncoding serialization constructor");
78 throw new ArgumentException(Environment.GetResourceString("Arg_ExecutionEngineException"));
// Base code page whose tables are used for each ISO 2022 code page.
// The constructor indexes this table with (codePage % 10), so the position of each entry matters.
// Entries of 0 have their intended base code page commented out.
81 static int[] tableBaseCodePages =
83 932, // 50220 ISO-2022-JP, No halfwidth Katakana, convert to full width
84 932, // 50221 ISO-2022-JP, Use escape sequence for half width Katakana
85 932, // 50222 ISO-2022-JP, Use shift-in/shift-out for half width Katakana
88 949, // 50225 ISO-2022-KR, Korean
89 936, // 52936 HZ-GB2312, 936 might be better source
90 0, //20936, // 50227 ISO-2022-CN, Note: This is just the same as CP 936 in Everett.
92 // 50229 is currently unsupported, CP 20000 is currently not built in .nlp file
93 0, //20000, // 50229 ISO-2022-CN, ModeCNS11643_1
94 0, //20000, // 50229 ISO-2022-CN, ModeCNS11643_2
// Modes tracked as encoder / decoder state while converting.
// The negative values are not real modes: CheckEscapeSequenceJP returns them to report
// an incomplete or an invalid escape sequence.
98 internal enum ISO2022Modes
100 ModeHalfwidthKatakana = 0,
109 ModeIncompleteEscape = -1,
110 ModeInvalidEscape = -2,
// Builds the memory section name for this encoding's tables.
// The name is formatted (invariant culture) from the code page in use and the four version
// fields read from pCodePage, with a suffix chosen according to this.CodePage.
114 [System.Security.SecurityCritical] // auto-generated
115 protected unsafe override String GetMemorySectionName()
// Use the data table code page when bFlagDataTable is set, otherwise our own code page.
117 int iUseCodePage = this.bFlagDataTable ? dataTableCodePage : CodePage;
121 switch (this.CodePage)
126 strFormat = "CodePage_{0}_{1}_{2}_{3}_{4}_ISO2022JP";
129 strFormat = "CodePage_{0}_{1}_{2}_{3}_{4}_ISO2022KR";
132 strFormat = "CodePage_{0}_{1}_{2}_{3}_{4}_HZ";
// Unexpected code page: assert, then fall back to a name without a suffix.
135 Contract.Assert(false, "[ISO2022Encoding.GetMemorySectionName] Don't expect to get here for code page " + this.CodePage);
136 strFormat = "CodePage_{0}_{1}_{2}_{3}_{4}";
140 String strName = String.Format(CultureInfo.InvariantCulture, strFormat,
141 iUseCodePage, this.pCodePage->VersionMajor, this.pCodePage->VersionMinor,
142 this.pCodePage->VersionRevision, this.pCodePage->VersionBuild);
147 // Clean up characters for ISO2022 code pages, etc.
148 // ISO2022 (50220, 50221, 50222)
// Adjusts the byte value passed by reference; the adjustment depends on this.CodePage.
150 protected override bool CleanUpBytes(ref int bytes)
152 switch (this.CodePage)
154 // 932 based code pages
161 // map extended char (0xfa40-0xfc4b) to a special range
162 // (ported from mlang)
163 if (bytes >= 0xfa40 && bytes <= 0xfc4b)
165 if ( bytes >= 0xfa40 && bytes <= 0xfa5b )
167 if ( bytes <= 0xfa49 )
168 bytes = bytes - 0x0b51 ;
169 else if ( bytes >= 0xfa4a && bytes <= 0xfa53 )
170 bytes = bytes - 0x072f6 ;
171 else if ( bytes >= 0xfa54 && bytes <= 0xfa57 )
172 bytes = bytes - 0x0b5b ;
173 else if ( bytes == 0xfa58 )
175 else if ( bytes == 0xfa59 )
177 else if ( bytes == 0xfa5a )
179 else if ( bytes == 0xfa5b )
182 else if ( bytes >= 0xfa5c && bytes <= 0xfc4b )
// The offset to subtract depends on the trail (low) byte.
184 byte tc = unchecked((byte)bytes);
186 bytes = bytes - 0x0d5f;
187 else if ( tc >= 0x80 && tc <= 0x9B )
188 bytes = bytes - 0x0d1d;
190 bytes = bytes - 0x0d1c;
194 // Convert 932 code page to 20932 like code page range
195 // (also ported from mlang)
196 byte bLead = unchecked((byte)(bytes >> 8));
197 byte bTrail = unchecked((byte)bytes);
199 bLead -= ((bLead > (byte)0x9f) ? (byte)0xb1 : (byte)0x71);
200 bLead = (byte)((bLead << 1) + 1);
201 if (bTrail > (byte)0x9e)
203 bTrail -= (byte)0x7e;
208 if (bTrail > (byte)0x7e)
210 bTrail -= (byte)0x1f;
// Recombine the adjusted lead and trail bytes.
213 bytes = ((int)bLead) << 8 | (int)bTrail;
215 // Don't step out of our allocated lead byte area.
216 // All DBCS lead and trail bytes should be >= 0x21 and <= 0x7e
217 // This is commented out because Everett/Mlang had illegal PUA
218 // mappings to ISO2022 code pages that we're maintaining.
219 // if ((bytes & 0xFF00) < 0x2100 || (bytes & 0xFF00) > 0x7e00 ||
220 // (bytes & 0xFF) < 0x21 || (bytes & 0xFF) > 0x7e)
225 // Adjust 1/2 Katakana: move 0xa1-0xdf under the LEADBYTE_HALFWIDTH lead byte and drop the 0x80 bit
226 if (bytes >= 0xa1 && bytes <= 0xdf)
227 bytes += (LEADBYTE_HALFWIDTH << 8) - 0x80;
229 // 0x81-0x9f and 0xe0-0xfc CP 932
230 // 0x8e and 0xa1-0xfe CP 20932 (we don't use 8e though)
231 // b0-df is 1/2 Katakana
234 (bytes >= 0xe0 && bytes <= 0xfc)))
236 // Don't do lead bytes, we use escape sequences instead.
244 // For 50225 since we don't rely on lead byte marks, return false and don't add them,
245 // esp. since we're only a 7 bit code page.
246 if (bytes >= 0x80 && bytes <= 0xff)
249 // Ignore double byte values where either byte is below 0xa1 or equal to 0xff
250 if (bytes >= 0x100 &&
251 ((bytes & 0xff) < 0xa1 || (bytes & 0xff) == 0xff ||
252 (bytes & 0xff00) < 0xa100 || (bytes & 0xff00) == 0xff00))
255 // May as well get them into our 7 bit range
262 // Since we don't rely on lead byte marks for 52936, get rid of them so we
263 // don't end up with extra weird fffe mappings.
264 if (bytes >= 0x81 && bytes <= 0xfe)
// Returns the number of bytes needed to encode count chars.
// This is computed by running GetBytes with a null output buffer and a byte count of 0.
275 [System.Security.SecurityCritical] // auto-generated
276 internal override unsafe int GetByteCount(char* chars, int count, EncoderNLS baseEncoder)
278 // Just need to ASSERT, this is called by something else internal that checked parameters already
279 Contract.Assert(count >= 0, "[ISO2022Encoding.GetByteCount]count is negative");
280 Contract.Assert(chars != null, "[ISO2022Encoding.GetByteCount]chars is null");
282 // Just call GetBytes with null byte* to get count
283 return GetBytes(chars, count, null, 0, baseEncoder);
// Encodes chars into bytes by delegating to one of the code-page-specific GetBytesCP* helpers.
// bytes may be null: GetByteCount calls this with a null buffer to obtain a count.
286 [System.Security.SecurityCritical] // auto-generated
287 internal override unsafe int GetBytes(char* chars, int charCount,
288 byte* bytes, int byteCount, EncoderNLS baseEncoder)
290 // Just need to ASSERT, this is called by something else internal that checked parameters already
291 Contract.Assert(chars != null, "[ISO2022Encoding.GetBytes]chars is null");
292 Contract.Assert(byteCount >= 0, "[ISO2022Encoding.GetBytes]byteCount is negative");
293 Contract.Assert(charCount >= 0, "[ISO2022Encoding.GetBytes]charCount is negative");
295 // Assert because we shouldn't be able to have a null encoder fallback.
296 Contract.Assert(encoderFallback != null, "[ISO2022Encoding.GetBytes]Attempting to use null encoder fallback");
// The helpers work with the ISO 2022 specific encoder state.
299 ISO2022Encoder encoder = (ISO2022Encoder)baseEncoder;
309 iCount = GetBytesCP5022xJP( chars, charCount, bytes, byteCount, encoder );
312 iCount = GetBytesCP50225KR( chars, charCount, bytes, byteCount, encoder );
314 // Everett had 50227 the same as 936
316 iCount = GetBytesCP50227CN( chars, charCount, bytes, byteCount, encoder );
320 iCount = GetBytesCP52936( chars, charCount, bytes, byteCount, encoder );
// Returns the number of chars that decoding count bytes produces.
// This is computed by running GetChars with a null output buffer and a char count of 0.
327 // This is internal and called by something else, so the parameters are only asserted here.
328 [System.Security.SecurityCritical] // auto-generated
329 internal override unsafe int GetCharCount(byte* bytes, int count, DecoderNLS baseDecoder)
331 // Just assert, we're called internally so these should be safe, checked already
332 Contract.Assert(bytes != null, "[ISO2022Encoding.GetCharCount]bytes is null");
333 Contract.Assert(count >= 0, "[ISO2022Encoding.GetCharCount]byteCount is negative");
335 // Just call getChars with null char* to get count
336 return GetChars(bytes, count, null, 0, baseDecoder);
// Decodes bytes into chars by delegating to one of the code-page-specific GetCharsCP* helpers.
// chars may be null: GetCharCount calls this with a null buffer to obtain a count.
339 [System.Security.SecurityCritical] // auto-generated
340 internal override unsafe int GetChars(byte* bytes, int byteCount,
341 char* chars, int charCount, DecoderNLS baseDecoder)
343 // Just need to ASSERT, this is called by something else internal that checked parameters already
344 Contract.Assert(bytes != null, "[ISO2022Encoding.GetChars]bytes is null");
345 Contract.Assert(byteCount >= 0, "[ISO2022Encoding.GetChars]byteCount is negative");
346 Contract.Assert(charCount >= 0, "[ISO2022Encoding.GetChars]charCount is negative");
// The helpers work with the ISO 2022 specific decoder state.
349 ISO2022Decoder decoder = (ISO2022Decoder)baseDecoder;
357 iCount = GetCharsCP5022xJP( bytes, byteCount, chars, charCount, decoder);
360 iCount = GetCharsCP50225KR( bytes, byteCount, chars, charCount, decoder);
362 // Currently 50227 is the same as 936
364 // iCount = GetCharsCP50227CN( bytes, byteCount, chars, charCount, decoder);
367 iCount = GetCharsCP52936( bytes, byteCount, chars, charCount, decoder);
370 Contract.Assert(false, "[ISO2022Encoding.GetChars] had unexpected code page");
377 // ISO 2022 Code pages for JP.
378 // 50220 - No halfwidth Katakana, convert to full width
379 // 50221 - Use escape sequence for half width Katakana
380 // 50222 - Use shift-in/shift-out for half width Katakana
382 // These are the JIS code pages, superset of ISO-2022 / ISO-2022-JP-1
383 // 0E Shift Out (following bytes are Katakana)
384 // 0F Shift In (back to "normal" behavior)
385 // 21-7E Byte ranges (1 or 2 bytes)
386 // <ESC> $ @ To Double Byte 0208 Mode (actually older code page, but subset of 0208)
387 // <ESC> $ B To Double Byte 0208 Mode (duplicate)
388 // <ESC> $ ( D To Double Byte 0212 Mode (previously we misinterpreted this)
389 // <ESC> ( I To half width Katakana
390 // <ESC> ( J To JIS-Roman
391 // <ESC> ( H To JIS-Roman (swedish character set)
392 // <ESC> ( B To ASCII
393 // <ESC> & @ Alternate lead in to <ESC> $ B so just ignore it.
395 // So in Katakana mode we add 0x8e as a lead byte and use CP 20932 to convert it
396 // In ASCII mode we just spit out the single byte.
397 // In Roman mode we should change 0x5c (\) -> Yen sign and 0x7e (~) to Overline, however
398 // we didn't in mLang, otherwise roman is like ASCII.
399 // In 0208 double byte mode we have to |= with 0x8080 and use CP 20932 to convert it.
400 // In 0212 double byte mode we have to |= with 0x8000 and use CP 20932 to convert it.
402 // Note that JIS Shift In/Shift Out is different than the other ISO2022 encodings. For JIS
403 // Shift out always shifts to half-width Katakana. Chinese encodings use designator sequences
404 // instead of escape sequences and shift out to the designated sequence or back in to ASCII.
406 // When decoding JIS 0208, MLang used a '*' (0x2a) character in JIS 0208 mode to map the trailing byte
407 // to halfwidth katakana. I found no description of that behavior, however that block of 0208 is
408 // undefined, so we maintain that behavior when decoding. We will never generate characters using
409 // that technique, but the decoder will process them.
// Encodes chars for the ISO-2022-JP code pages (50220, 50221, 50222).
// Each char is looked up in mapUnicodeToBytes and classified as halfwidth Katakana, double byte
// or single byte. An escape sequence or shift byte is emitted first whenever the required mode
// differs from the current one. The mode is read from the encoder at the start and written back
// at the end when bytes is non-null.
411 [System.Security.SecurityCritical] // auto-generated
412 private unsafe int GetBytesCP5022xJP(char* chars, int charCount,
413 byte* bytes, int byteCount, ISO2022Encoder encoder)
415 // prepare our helpers
416 Encoding.EncodingByteBuffer buffer = new Encoding.EncodingByteBuffer(
417 this, encoder, bytes, byteCount, chars, charCount);
420 ISO2022Modes currentMode = ISO2022Modes.ModeASCII; // Mode
421 ISO2022Modes shiftInMode = ISO2022Modes.ModeASCII; // Mode that shift in will go back to (only used by CP 50222)
// Pick up any state left by a previous call
426 char charLeftOver = encoder.charLeftOver;
428 currentMode = encoder.currentMode;
429 shiftInMode = encoder.shiftInOutMode;
431 // We may have a left over character from last time, try and process it.
432 if (charLeftOver > 0)
434 Contract.Assert(Char.IsHighSurrogate(charLeftOver), "[ISO2022Encoding.GetBytesCP5022xJP]leftover character should be high surrogate");
436 // It has to be a high surrogate, which we don't support, so it has to be a fallback
437 buffer.Fallback(charLeftOver);
441 while (buffer.MoreData)
444 char ch = buffer.GetNextChar();
// Look up the byte value for this char
447 ushort iBytes = mapUnicodeToBytes[ch];
450 // Check for halfwidth bytes
451 byte bLeadByte = (byte)(iBytes >> 8);
452 byte bTrailByte = (byte)(iBytes & 0xff);
454 if (bLeadByte == LEADBYTE_HALFWIDTH)
456 // It's Halfwidth Katakana
457 if (CodePage == 50220)
459 // CodePage 50220 doesn't use halfwidth Katakana, convert to fullwidth
460 // See if it's out of range, fallback if so, throws if recursive fallback
461 if (bTrailByte < 0x21 || bTrailByte >= 0x21 + HalfToFullWidthKanaTable.Length)
467 // Get the full width katakana char to use.
468 iBytes = unchecked((ushort)(HalfToFullWidthKanaTable[bTrailByte - 0x21] & 0x7F7F));
470 // May have to do all sorts of fun stuff for mode, go back to start convert
474 // Can use halfwidth Katakana, make sure we're in right mode
476 // Make sure we're in right mode
477 if (currentMode != ISO2022Modes.ModeHalfwidthKatakana)
479 // 50222 or 50221, either shift in/out or escape to get to Katakana mode
480 if (CodePage == 50222)
483 if (!buffer.AddByte(SHIFT_OUT))
484 break; // convert out of space, stop
486 // Don't change modes until after AddByte in case it fails for convert
487 // We get to shift out to Katakana, make sure we'll go back to the right mode
488 // (This ends up always being ASCII)
489 shiftInMode = currentMode;
490 currentMode = ISO2022Modes.ModeHalfwidthKatakana;
494 // 50221 does halfwidth katakana by escape sequence
495 Contract.Assert(CodePage == 50221, "[ISO2022Encoding.GetBytesCP5022xJP]Expected Code Page 50221");
497 // Add our escape sequence (<ESC> ( I)
498 if (!buffer.AddByte(ESCAPE, unchecked((byte)'('), unchecked((byte)'I')))
499 break; // convert out of space, stop
501 currentMode = ISO2022Modes.ModeHalfwidthKatakana;
505 // We know we're in Katakana mode now, so add it.
506 // Go ahead and add the Katakana byte. Our table tail bytes are 0x80 too big.
507 if (!buffer.AddByte(unchecked((byte)(bTrailByte & 0x7F))))
508 break; // convert out of space, stop
510 // Done with this one
513 else if (bLeadByte != 0)
516 // It's a double byte character.
519 // If we're CP 50222 we may have to shift in from Katakana mode first
520 if (CodePage == 50222 && currentMode == ISO2022Modes.ModeHalfwidthKatakana)
523 if (!buffer.AddByte(SHIFT_IN))
524 break; // convert out of space, stop
526 // Need to shift in from katakana. (Still might not be right, but won't be shifted out anyway)
527 currentMode = shiftInMode;
530 // Make sure we're in the right mode (JIS 0208 or JIS 0212)
531 // Note: Right now we don't use JIS 0212. Also this table'd be wrong
533 // It's JIS extension 0208
534 if (currentMode != ISO2022Modes.ModeJIS0208)
536 // Escape sequence (<ESC> $ B), we can fail after this, mode will be correct for convert
537 if (!buffer.AddByte(ESCAPE, unchecked((byte)'$'), unchecked((byte)'B')))
538 break; // Convert out of space, stop
540 currentMode = ISO2022Modes.ModeJIS0208;
543 // Add our double bytes
544 if (!buffer.AddByte(unchecked((byte)(bLeadByte)), unchecked((byte)(bTrailByte))))
545 break; // Convert out of space, stop
548 else if (iBytes != 0 || ch == 0)
551 // If we're CP 50222 we may have to shift in from Katakana mode first
552 if (CodePage == 50222 && currentMode == ISO2022Modes.ModeHalfwidthKatakana)
555 if (!buffer.AddByte(SHIFT_IN))
556 break; // convert ran out of room
558 // Need to shift in from katakana. (Still might not be right, but won't be shifted out anyway)
559 currentMode = shiftInMode;
562 // It's a single byte character, switch to ASCII (<ESC> ( B) if we have to
563 if (currentMode != ISO2022Modes.ModeASCII)
565 if (!buffer.AddByte(ESCAPE,unchecked((byte)'('), unchecked((byte)'B')))
566 break; // convert ran out of room
568 currentMode = ISO2022Modes.ModeASCII;
571 // Add the ASCII char
572 if (!buffer.AddByte(bTrailByte))
573 break; // convert had no room left
577 // It's unknown, do fallback, throws if recursive (knows because we called InternalGetNextChar)
581 // Switch back to ASCII if MustFlush or no encoder
582 if (currentMode != ISO2022Modes.ModeASCII &&
583 (encoder == null || encoder.MustFlush))
585 // If we're CP 50222 we may have to shift in from Katakana mode first
586 if (CodePage == 50222 && currentMode == ISO2022Modes.ModeHalfwidthKatakana)
588 // Shift IN, only shift mode if necessary.
589 if (buffer.AddByte(SHIFT_IN))
590 // Need to shift in from katakana. (Still might not be right, but won't be shifted out anyway)
591 currentMode = shiftInMode;
593 // If not successful, convert will maintain state for next time, also
594 // AddByte will have decremented our char count, however we need it to remain the same
595 buffer.GetNextChar();
598 // switch back to ASCII to finish neatly
599 if (currentMode != ISO2022Modes.ModeASCII &&
600 (CodePage != 50222 || currentMode != ISO2022Modes.ModeHalfwidthKatakana))
602 // only shift if it was successful
603 if (buffer.AddByte(ESCAPE, unchecked((byte)'('), unchecked((byte)'B')))
604 currentMode = ISO2022Modes.ModeASCII;
606 // If not successful, convert will maintain state for next time, also
607 // AddByte will have decremented our char count, however we need it to remain the same
608 buffer.GetNextChar();
612 // Remember our encoder state (only when actually writing bytes, not when counting)
613 if (bytes != null && encoder != null)
615 // This is ASCII if we had to flush
616 encoder.currentMode = currentMode;
617 encoder.shiftInOutMode = shiftInMode;
// If the fallback buffer didn't use the encoder, then there's no char left over
619 if (!buffer.fallbackBuffer.bUsedEncoder)
621 encoder.charLeftOver = (char)0;
624 encoder.m_charsUsed = buffer.CharsUsed;
631 // ISO 2022 Code pages for Korean - CP 50225
633 // CP 50225 has Shift In/Shift Out codes, and a single designator sequence that is supposed
634 // to appear once in the file, at the beginning of a line, before any multibyte code points.
635 // So we stick the designator at the beginning of the output.
637 // These are the KR code page codes for ISO-2022-KR
638 // 0E Shift Out (following bytes are double byte)
639 // 0F Shift In (back to ASCII behavior)
640 // 21-7E Byte ranges (1 or 2 bytes)
641 // <ESC> $)C Double byte ISO-2022-KR designator
643 // Note that this encoding is a little different than other encodings. The <esc>$)C sequence
644 // should only appear once per file. (Actually I saw another spec/rfc that said at the beginning
645 // of each line, but it shouldn't really matter.)
647 // During decoding Mlang accepted ' ', '\t', and '\n' as their respective characters, even if
648 // it was in double byte mode. We maintain that behavior, although I couldn't find a reference or
649 // reason for that behavior. We never generate data using that shortcut.
651 // Also Mlang always assumed KR mode, even if the designator wasn't found yet, so we do that as
652 // well. So basically we just ignore <ESC>$)C when decoding.
// Encodes chars for ISO-2022-KR (code page 50225).
// The <ESC>$)C designator is emitted once, before the first double byte character.
// SHIFT_OUT / SHIFT_IN are emitted when switching between double byte (ModeKR) and ASCII output.
// The mode is read from the encoder at the start and written back at the end when bytes is non-null.
654 [System.Security.SecurityCritical] // auto-generated
655 private unsafe int GetBytesCP50225KR(char* chars, int charCount,
656 byte* bytes, int byteCount, ISO2022Encoder encoder)
658 // prepare our helpers
659 Encoding.EncodingByteBuffer buffer = new Encoding.EncodingByteBuffer(
660 this, encoder, bytes, byteCount, chars, charCount);
663 ISO2022Modes currentMode = ISO2022Modes.ModeASCII; // Mode
664 ISO2022Modes shiftOutMode = ISO2022Modes.ModeASCII; // ModeKR if already stamped lead bytes
669 // May have leftover stuff
670 char charLeftOver = encoder.charLeftOver;
671 currentMode = encoder.currentMode;
672 shiftOutMode = encoder.shiftInOutMode;
674 // We may have a left over character from last time, try and process it.
675 if (charLeftOver > 0)
677 Contract.Assert(Char.IsHighSurrogate(charLeftOver), "[ISO2022Encoding.GetBytesCP50225KR]leftover character should be high surrogate");
679 // It has to be a high surrogate, which we don't support, so it has to be a fallback
680 buffer.Fallback(charLeftOver);
684 while (buffer.MoreData)
687 char ch = buffer.GetNextChar();
// Look up the byte value for this char
690 ushort iBytes = mapUnicodeToBytes[ch];
692 // Check for double byte bytes
693 byte bLeadByte = (byte)(iBytes >> 8);
694 byte bTrailByte = (byte)(iBytes & 0xff);
699 // It's a double byte character.
702 // If we haven't done our Korean designator, then do so, if we have any input
703 if (shiftOutMode != ISO2022Modes.ModeKR)
705 // Add our code page designator sequence
706 if (!buffer.AddByte(ESCAPE, unchecked((byte)'$'), unchecked((byte)')'), unchecked((byte)'C')))
707 break; // No room during convert.
709 shiftOutMode = ISO2022Modes.ModeKR;
712 // May have to switch to ModeKR first
713 if (currentMode != ISO2022Modes.ModeKR)
715 if (!buffer.AddByte(SHIFT_OUT))
716 break; // No convert room
718 currentMode = ISO2022Modes.ModeKR;
722 if (!buffer.AddByte(bLeadByte, bTrailByte))
723 break; // no convert room
726 else if (iBytes != 0 || ch == 0)
728 // It's a single byte character, switch to ASCII if we have to
729 if (currentMode != ISO2022Modes.ModeASCII)
731 if (!buffer.AddByte(SHIFT_IN))
734 currentMode = ISO2022Modes.ModeASCII;
737 // Add the ASCII char
738 if (!buffer.AddByte(bTrailByte))
743 // It's unknown, do fallback, throws if recursive (knows because we called InternalGetNextChar)
747 // Switch back to ASCII if MustFlush or no encoder
748 if (currentMode != ISO2022Modes.ModeASCII &&
749 (encoder == null || encoder.MustFlush))
751 // Get back to ASCII to be safe. Only do it if it succeeds.
752 if (buffer.AddByte(SHIFT_IN))
753 currentMode = ISO2022Modes.ModeASCII;
755 // If not successful, convert will maintain state for next time, also
756 // AddByte will have decremented our char count, however we need it to remain the same
757 buffer.GetNextChar();
760 // Remember our encoder state (only when actually writing bytes, not when counting)
761 if (bytes != null && encoder != null)
763 // If we didn't use the encoder, then there's no chars left over
764 if (!buffer.fallbackBuffer.bUsedEncoder)
766 encoder.charLeftOver = (char)0;
769 // This is ASCII if we had to flush
770 encoder.currentMode = currentMode;
772 // We don't use shift out mode, but if we've flushed we need to reset it so it doesn't
774 if (!encoder.MustFlush || encoder.charLeftOver != (char)0)
776 // We should be not flushing or converting
777 Contract.Assert(!encoder.MustFlush || !encoder.m_throwOnOverflow,
778 "[ISO2022Encoding.GetBytesCP50225KR]Expected no left over data or not flushing or not converting");
779 encoder.shiftInOutMode = shiftOutMode;
782 encoder.shiftInOutMode = ISO2022Modes.ModeASCII;
784 encoder.m_charsUsed = buffer.CharsUsed;
791 // CP52936 is HZ Encoding
792 // HZ Encoding has 4 shift sequences:
794 // ~} shift into 1 byte mode,
795 // ~{ shift into 2 byte GB 2312-80
796 // ~<NL> Maintain 2 byte mode across new lines (ignore both ~ and <NL> characters)
797 // (This is for mailers that restrict to 70 or 80 or whatever character lines)
799 // According to comment in mlang, lead & trail byte ranges are described in RFC 1843
800 // RFC 1843 => valid HZ code range: leading byte 0x21 - 0x77, 2nd byte 0x21 - 0x7e
801 // Our 936 code points are or'd with 0x8080, so lead byte 0xa1 - 0xf7, trail byte 0xa1 - 0xfe
803 // This encoding is designed for transmission by e-mail and news. No bytes should have high bit set.
804 // (all bytes <= 0x7f)
// Encodes chars for HZ-GB2312 (code page 52936).
// "~{" is emitted before double byte output and "~}" before returning to ASCII output.
// A literal '~' is written as two '~' bytes. Double byte values are written with the high bit
// of each byte cleared. The mode is read from the encoder at the start and written back at the
// end when bytes is non-null.
805 [System.Security.SecurityCritical] // auto-generated
806 private unsafe int GetBytesCP52936(char* chars, int charCount,
807 byte* bytes, int byteCount, ISO2022Encoder encoder)
809 // prepare our helpers
810 Encoding.EncodingByteBuffer buffer = new Encoding.EncodingByteBuffer(
811 this, encoder, bytes, byteCount, chars, charCount);
814 ISO2022Modes currentMode = ISO2022Modes.ModeASCII;
// Pick up any state left by a previous call
819 char charLeftOver = encoder.charLeftOver;
820 currentMode = encoder.currentMode;
822 // We may have a left over character from last time, try and process it.
823 if (charLeftOver > 0)
825 Contract.Assert(Char.IsHighSurrogate(charLeftOver), "[ISO2022Encoding.GetBytesCP52936]leftover character should be high surrogate");
827 // It has to be a high surrogate, which we don't support, so it has to be a fallback
828 buffer.Fallback(charLeftOver);
832 while (buffer.MoreData)
835 char ch = buffer.GetNextChar();
// Look up the byte value for this char; 0 means no mapping unless the char itself is 0
838 ushort sChar = mapUnicodeToBytes[ch];
839 if (sChar == 0 && ch != 0)
841 // Wasn't a legal byte sequence, it's a surrogate or fallback
842 // Throws if recursive (knows because we called InternalGetNextChar)
845 // Done with our char, now process fallback
849 // Split the value into its lead and trail bytes
850 byte bLeadByte = (byte)(sChar >> 8);
851 byte bTrailByte = (byte)(sChar & 0xff);
853 // If it's a double byte, it has to fit in the lead byte 0xa1 - 0xf7, trail byte 0xa1 - 0xfe range
854 // (including the 0x8080 that our codepage or's to the value)
855 if ((bLeadByte != 0 &&
856 (bLeadByte < 0xa1 || bLeadByte > 0xf7 || bTrailByte < 0xa1 || bTrailByte > 0xfe)) ||
857 (bLeadByte == 0 && bTrailByte > 0x80 && bTrailByte != 0xff))
859 // Illegal character, in 936 code page, but not in HZ subset, get fallback for it
864 // sChar is now either ASCII or has an 0x8080 mask
867 // It's a double byte mode
868 if (currentMode != ISO2022Modes.ModeHZ)
870 // Need to add the double byte mode marker
871 if (!buffer.AddByte((byte)'~', (byte)'{', 2))
872 break; // Stop if no buffer space in convert
874 currentMode = ISO2022Modes.ModeHZ;
877 // Go ahead and add the 2 bytes (with the high bit of each byte cleared)
878 if (!buffer.AddByte(unchecked((byte)(bLeadByte & 0x7f)), unchecked((byte)(bTrailByte & 0x7f))))
879 break; // Stop if no buffer space in convert
883 // It's supposed to be ASCII
884 if (currentMode != ISO2022Modes.ModeASCII)
886 // Need to add the ASCII mode marker
887 // Will have 1 more byte (or 2 if ~)
888 if (!buffer.AddByte((byte)'~', (byte)'}', bTrailByte == '~' ? 2:1))
891 currentMode = ISO2022Modes.ModeASCII;
894 // If it's a '~' we'll need an extra one
895 if (bTrailByte == '~')
897 // Need to add the extra ~
898 if (!buffer.AddByte((byte)'~', 1))
902 // Need to add the character
903 if (!buffer.AddByte(bTrailByte))
908 // Add the ASCII mode marker if we're at end of encoder
909 if (currentMode != ISO2022Modes.ModeASCII &&
910 (encoder == null || encoder.MustFlush))
912 // Need to add the ASCII mode marker
913 // Only turn off other mode if this works
914 if (buffer.AddByte((byte)'~',(byte)'}'))
915 currentMode = ISO2022Modes.ModeASCII;
917 // If not successful, convert will maintain state for next time, also
918 // AddByte will have decremented our char count, however we need it to remain the same
919 buffer.GetNextChar();
922 // Need to remember our mode (only when actually writing bytes, not when counting)
923 if (encoder != null && bytes != null)
925 // This is ASCII if we had to flush
926 encoder.currentMode = currentMode;
// If the fallback buffer didn't use the encoder, then there's no char left over
928 if (!buffer.fallbackBuffer.bUsedEncoder)
930 encoder.charLeftOver = (char)0;
933 encoder.m_charsUsed = buffer.CharsUsed;
// Decodes bytes for the ISO-2022-JP code pages into chars and returns buffer.Count.
// Bytes of a possible escape sequence are collected in escapeBytes and checked with
// CheckEscapeSequenceJP. SHIFT_OUT switches to halfwidth Katakana mode and SHIFT_IN switches back.
// Everything else is converted through mapBytesToUnicode. Mode and unfinished escape bytes are
// read from the decoder at the start and stored back at the end when chars is non-null.
940 [System.Security.SecurityCritical] // auto-generated
941 private unsafe int GetCharsCP5022xJP(byte* bytes, int byteCount,
942 char* chars, int charCount, ISO2022Decoder decoder)
// prepare our helpers
945 Encoding.EncodingCharBuffer buffer = new Encoding.EncodingCharBuffer(
946 this, decoder, chars, charCount, bytes, byteCount);
948 // No mode information yet
949 ISO2022Modes currentMode = ISO2022Modes.ModeASCII; // Our current Mode
950 ISO2022Modes shiftInMode = ISO2022Modes.ModeASCII; // Mode that we'll shift in to
// Local buffer for escape sequence bytes
951 byte[] escapeBytes = new byte[4];
956 currentMode = decoder.currentMode;
957 shiftInMode = decoder.shiftInOutMode;
959 // See if we have leftover decoder buffer to use
960 // Load our bytesLeftOver
961 escapeCount = decoder.bytesLeftOverCount;
963 // Don't want to mess up decoder if we're counting or throw an exception
964 for (int i = 0; i < escapeCount; i++)
965 escapeBytes[i] = decoder.bytesLeftOver[i];
968 // Do this until the end
969 while (buffer.MoreData || escapeCount > 0)
975 // Get more escape sequences if necessary
976 if (escapeBytes[0] == ESCAPE)
978 // Stop if no more input
979 if (!buffer.MoreData)
981 if (decoder != null && !decoder.MustFlush)
986 // Add it to the sequence we can check
987 escapeBytes[escapeCount++] = buffer.GetNextByte();
989 // We have an escape sequence
990 ISO2022Modes modeReturn =
991 CheckEscapeSequenceJP(escapeBytes, escapeCount);
993 if (modeReturn != ISO2022Modes.ModeInvalidEscape)
995 if (modeReturn != ISO2022Modes.ModeIncompleteEscape)
997 // Processed escape correctly
1000 // We're now this mode
1001 currentMode = shiftInMode = modeReturn;
1004 // Either way, continue to get next escape or real byte
1009 // If ModeInvalidEscape, or no input & must flush, then fall through to add escape.
1012 // Read next escape byte and move them down one.
1013 ch = DecrementEscapeBytes(ref escapeBytes, ref escapeCount);
1017 // Get our next byte
1018 ch = buffer.GetNextByte();
1022 // We'll have an escape sequence, use it if we don't have one buffered already
1023 if (escapeCount == 0)
1025 // Start this new escape sequence
1026 escapeBytes[0] = ch;
1031 // Flush the previous escape sequence, then reuse this escape byte
1032 buffer.AdjustBytes(-1);
// Shift out: remember the mode to come back to and switch to halfwidth Katakana
1036 if (ch == SHIFT_OUT)
1038 shiftInMode = currentMode;
1039 currentMode = ISO2022Modes.ModeHalfwidthKatakana;
// Shift in: go back to the remembered mode
1042 else if (ch == SHIFT_IN)
1044 currentMode = shiftInMode;
1048 // Get our full character
1050 bool b2Bytes = false;
1052 if (currentMode == ISO2022Modes.ModeJIS0208)
1055 // To handle errors, we need to check:
1056 // 1. if trailbyte is there
1057 // 2. if code is valid
1059 if (escapeCount > 0)
1061 // Let another escape fall through
1062 if (escapeBytes[0] != ESCAPE)
1064 // Move them down one & get the next data
1066 iBytes |= DecrementEscapeBytes(ref escapeBytes, ref escapeCount);
1070 else if (buffer.MoreData)
1073 iBytes |= buffer.GetNextByte();
1078 // Not enough input, use decoder if possible
1079 if (decoder == null || decoder.MustFlush)
1081 // No decoder, do fallback for this byte
1082 buffer.Fallback(ch);
1086 // Stick it in the decoder if we're not counting
1089 escapeBytes[0] = ch;
1095 // MLang treated JIS 0208 '*' lead byte like a single halfwidth katakana
1096 // escape, so use the halfwidth katakana lead byte and keep same trail byte.
1097 // 0x2a lead byte range is normally unused in JIS 0208, so shouldn't have
1098 // any weird compatibility issues.
1099 if ((b2Bytes == true) && ((iBytes & 0xff00) == 0x2a00))
1101 iBytes = (ushort)(iBytes & 0xff);
1102 iBytes |= (LEADBYTE_HALFWIDTH << 8); // Put us in the halfwidth katakana range
1105 else if (iBytes >= 0xA1 && iBytes <= 0xDF)
1107 // Everett accidentally mapped Katakana like shift-jis (932),
1108 // even though this is a 7 bit code page. We keep that mapping
1109 iBytes |= (LEADBYTE_HALFWIDTH << 8); // Map to halfwidth katakana range
1110 iBytes &= 0xff7f; // remove extra 0x80
1112 else if (currentMode == ISO2022Modes.ModeHalfwidthKatakana )
1114 // Add 0x10 lead byte that our encoding expects for Katakana:
1115 iBytes |= (LEADBYTE_HALFWIDTH << 8);
1118 // We have an iBytes to try to convert.
1119 char c = mapBytesToUnicode[iBytes];
1121 // See if it was unknown
1122 if (c == UNKNOWN_CHAR_FLAG && iBytes != 0)
1124 // Have to do fallback
1127 if (!buffer.Fallback((byte)(iBytes >> 8), (byte)iBytes))
1132 if (!buffer.Fallback(ch))
1138 // If we were JIS 0208, then we consumed an extra byte
1139 if (!buffer.AddChar(c, b2Bytes ? 2:1))
1144 // Make sure our decoder state matches our mode, if not counting
1145 if (chars != null && decoder != null)
1147 // Remember it if we don't flush
1148 if (!decoder.MustFlush || escapeCount != 0)
1150 // Either not flushing or had state (from convert)
1151 Contract.Assert(!decoder.MustFlush || !decoder.m_throwOnOverflow,
1152 "[ISO2022Encoding.GetCharsCP5022xJP]Expected no state or not converting or not flushing");
1154 decoder.currentMode = currentMode;
1155 decoder.shiftInOutMode = shiftInMode;
1157 // Remember escape buffer
1158 decoder.bytesLeftOverCount = escapeCount;
1159 decoder.bytesLeftOver = escapeBytes;
1163 // We flush, clear buffer
1164 decoder.currentMode = ISO2022Modes.ModeASCII;
1165 decoder.shiftInOutMode = ISO2022Modes.ModeASCII;
1166 decoder.bytesLeftOverCount = 0;
1167 // Slightly different if counting/not counting
1170 decoder.m_bytesUsed = buffer.BytesUsed;
1173 // Return # of characters we found
1174 return buffer.Count;
// Classifies the escape sequence currently held in the escape buffer (Japanese code pages).
//   bytes       - escape buffer; a real sequence starts with the ESCAPE byte in bytes[0].
//   escapeCount - number of bytes of the buffer that are filled in.
// Returns the mode the sequence selects, ModeNOOP for <esc>&@, ModeIncompleteEscape when
// more bytes are needed before a decision can be made, or ModeInvalidEscape otherwise.
private ISO2022Modes CheckEscapeSequenceJP( byte[] bytes, int escapeCount )
{
    // Without a leading ESC this is not an escape sequence at all
    if (bytes[0] != ESCAPE)
    {
        return ISO2022Modes.ModeInvalidEscape;
    }

    // Every sequence handled here needs at least two bytes after the ESC
    if (escapeCount < 3)
    {
        return ISO2022Modes.ModeIncompleteEscape;
    }

    byte selector = bytes[2];

    switch (bytes[1])
    {
        case (byte)'(':
            switch (selector)
            {
                case (byte)'B':     // <esc>(B
                case (byte)'H':     // <esc>(H is supposed to be Swedish, but is treated like ASCII
                case (byte)'J':     // <esc>(J is supposed to be Roman (2 characters differ), historically treated as ASCII
                    return ISO2022Modes.ModeASCII;

                case (byte)'I':     // <esc>(I
                    return ISO2022Modes.ModeHalfwidthKatakana;
            }
            break;

        case (byte)'$':
            // <esc>$@ and <esc>$B
            if (selector == '@' || selector == 'B')
            {
                return ISO2022Modes.ModeJIS0208;
            }

            // The only other candidate is the 4 byte <esc>$(D, so we need one more byte
            if (escapeCount < 4)
            {
                return ISO2022Modes.ModeIncompleteEscape;
            }

            // <esc>$(D : Mlang treated 0208 like 0212 even though that's wrong
            if (selector == '(' && bytes[3] == 'D')
            {
                return ISO2022Modes.ModeJIS0208;
            }
            break;

        case (byte)'&':
            // <esc>&@ is ignored (prefix to <esc>$B)
            if (selector == '@')
            {
                return ISO2022Modes.ModeNOOP;
            }
            break;
    }

    // Nothing matched: invalid/unknown escape sequence
    return ISO2022Modes.ModeInvalidEscape;
}
// Pops the first byte off the front of the escape buffer.
//   bytes - escape buffer; the remaining bytes are shifted down by one and the slot
//           that becomes free is zeroed.
//   count - number of buffered bytes; reduced by one.
// Returns the byte that was at bytes[0] on entry.
private byte DecrementEscapeBytes(ref byte[] bytes, ref int count)
{
    Contract.Assert(count > 0, "[ISO2022Encoding.DecrementEscapeBytes]count > 0");

    // One fewer byte is buffered once the head has been taken
    count--;

    // Take the head before it is overwritten by the shift
    byte head = bytes[0];

    // Slide everything that is left one slot towards the front
    int slot = 0;
    while (slot < count)
    {
        bytes[slot] = bytes[slot + 1];
        slot++;
    }

    // The slot just past the remaining bytes is now stale, zero it
    bytes[count] = 0;

    return head;
}
1266 // Note that in DBCS mode mlang passed through ' ', '\t' and '\n' as SBCS characters
1267 // probably to allow mailer formatting without too much extra work.
//
// Decodes code page 50225 (ISO-2022-KR) bytes into chars, tracking the SHIFT_OUT/SHIFT_IN
// state and any escape sequence that is still being collected.
//   bytes, byteCount - input bytes, wrapped by the EncodingCharBuffer created below.
//   chars, charCount - output buffer; the state-saving code at the end treats
//                      chars == null as "counting" and leaves the decoder untouched.
//   decoder          - may be null; when present it supplies the starting mode and any
//                      left-over escape bytes, and receives the final state.
// Returns buffer.Count, the number of characters found.
1268 [System.Security.SecurityCritical] // auto-generated
1269 private unsafe int GetCharsCP50225KR(byte* bytes, int byteCount,
1270 char* chars, int charCount, ISO2022Decoder decoder)
1273 Encoding.EncodingCharBuffer buffer = new Encoding.EncodingCharBuffer(
1274 this, decoder, chars, charCount, bytes, byteCount);
1276 // No mode information yet
1277 ISO2022Modes currentMode = ISO2022Modes.ModeASCII; // Our current Mode
// Local copy of the pending escape bytes; 4 is the length CheckEscapeSequenceKR
// requires before it will accept or reject a sequence.
1279 byte[] escapeBytes = new byte[4];
1280 int escapeCount = 0;
1282 if (decoder != null)
1284 currentMode = decoder.currentMode;
1286 // See if we have leftover decoder buffer to use
1287 // Load our bytesLeftOver
1288 escapeCount = decoder.bytesLeftOverCount;
1290 // Don't want to mess up decoder if we're counting or throw an exception
1291 for (int i = 0; i < escapeCount; i++)
1292 escapeBytes[i] = decoder.bytesLeftOver[i];
1295 // Do this until the end, just do '?' replacement because we don't have fallbacks for decodings.
// The loop also keeps running while buffered escape bytes remain, even with no input left.
1296 while (buffer.MoreData || escapeCount > 0)
1300 if (escapeCount > 0)
1302 // Get more escape sequences if necessary
1303 if (escapeBytes[0] == ESCAPE)
1305 // Stop if no more input
1306 if (!buffer.MoreData)
1308 if (decoder != null && !decoder.MustFlush)
1313 // Add it to the sequence we can check
1314 escapeBytes[escapeCount++] = buffer.GetNextByte();
1316 // We have an escape sequence
1317 ISO2022Modes modeReturn =
1318 CheckEscapeSequenceKR(escapeBytes, escapeCount);
1320 if (modeReturn != ISO2022Modes.ModeInvalidEscape)
1322 if (modeReturn != ISO2022Modes.ModeIncompleteEscape)
1324 // Processed escape correctly, no effect (we know about KR mode)
1328 // Either way, continue to get next escape or real byte
1333 // If ModeInvalidEscape, or no input & must flush, then fall through to add escape.
1336 // Still have something left over in escape buffer
1337 // Get it and move them down one
1338 ch = DecrementEscapeBytes(ref escapeBytes, ref escapeCount);
1342 // Get our next byte
1343 ch = buffer.GetNextByte();
1347 // We'll have an escape sequence, use it if we don't have one buffered already
1348 if (escapeCount == 0)
1350 // Start this new escape sequence
1351 escapeBytes[0] = ch;
1356 // Flush previous escape sequence, then reuse this escape byte
1357 buffer.AdjustBytes(-1);
// SHIFT_OUT selects the Korean double byte mode, SHIFT_IN goes back to ASCII
1361 if (ch == SHIFT_OUT)
1363 currentMode = ISO2022Modes.ModeKR;
1366 else if (ch == SHIFT_IN)
1368 currentMode = ISO2022Modes.ModeASCII;
1372 // Get our full character
1374 bool b2Bytes = false;
1376 // MLANG was passing through ' ', '\t' and '\n', so we do so as well, but I don't see that in the RFC.
1377 if (currentMode == ISO2022Modes.ModeKR && ch != ' ' && ch != '\t' && ch != '\n')
1380 // To handle errors, we need to check:
1381 // 1. if trailbyte is there
1382 // 2. if code is valid
// The trail byte comes from the escape buffer when one is pending, otherwise from the input
1384 if (escapeCount > 0)
1386 // Let another escape fall through
1387 if (escapeBytes[0] != ESCAPE)
1389 // Move them down one & get the next data
1391 iBytes |= DecrementEscapeBytes(ref escapeBytes, ref escapeCount);
1395 else if (buffer.MoreData)
1398 iBytes |= buffer.GetNextByte();
1403 // Not enough input, use decoder if possible
1404 if (decoder == null || decoder.MustFlush)
1406 // No decoder, do fallback for lonely 1st byte
1407 buffer.Fallback(ch);
1411 // Stick it in the decoder if we're not counting
1414 escapeBytes[0] = ch;
1421 // We have a iBytes to try to convert.
1422 char c = mapBytesToUnicode[iBytes];
1424 // See if it was unknown
// NOTE(review): iBytes == 0 is excluded from the unknown test; presumably the table entry
// for 0 equals UNKNOWN_CHAR_FLAG - confirm against the constant's definition.
1425 if (c == UNKNOWN_CHAR_FLAG && iBytes != 0)
1427 // Have to do fallback
1430 if (!buffer.Fallback((byte)(iBytes >> 8), (byte)iBytes))
1435 if (!buffer.Fallback(ch))
// Second argument is 2 when b2Bytes is set, otherwise 1
1441 if (!buffer.AddChar(c, b2Bytes ? 2:1))
1446 // Make sure our decoder state matches our mode, if not counting
1447 if (chars != null && decoder != null)
1449 // Remember it if we don't flush
1450 if (!decoder.MustFlush || escapeCount != 0)
1452 // Either not flushing or had state (from convert)
1453 Contract.Assert(!decoder.MustFlush || !decoder.m_throwOnOverflow,
1454 "[ISO2022Encoding.GetCharsCP50225KR]Expected no state or not converting or not flushing");
1456 decoder.currentMode = currentMode;
1458 // Remember escape buffer
1459 decoder.bytesLeftOverCount = escapeCount;
1460 decoder.bytesLeftOver = escapeBytes;
1464 // We flush, clear buffer
1465 decoder.currentMode = ISO2022Modes.ModeASCII;
1466 decoder.shiftInOutMode = ISO2022Modes.ModeASCII;
1467 decoder.bytesLeftOverCount = 0;
1470 decoder.m_bytesUsed = buffer.BytesUsed;
1473 // Return # of characters we found
1474 return buffer.Count;
// Classifies the escape sequence currently held in the escape buffer (Korean code page).
//   bytes       - escape buffer; a real sequence starts with the ESCAPE byte in bytes[0].
//   escapeCount - number of bytes of the buffer that are filled in.
// Returns ModeKR for <esc>$)C, ModeIncompleteEscape while fewer than 4 bytes are
// buffered, and ModeInvalidEscape for anything else.
private ISO2022Modes CheckEscapeSequenceKR( byte[] bytes, int escapeCount )
{
    if (bytes[0] == ESCAPE)
    {
        // The only sequence we know is 4 bytes long, wait until we have all of it
        if (escapeCount < 4)
        {
            return ISO2022Modes.ModeIncompleteEscape;
        }

        // <esc>$)C
        bool isKRDesignator = bytes[1] == '$' && bytes[2] == ')' && bytes[3] == 'C';
        if (isKRDesignator)
        {
            return ISO2022Modes.ModeKR;
        }
    }

    // No leading escape, or an escape sequence we don't recognize
    return ISO2022Modes.ModeInvalidEscape;
}
1494 // CP52936 is HZ Encoding
1495 // HZ Encoding has 4 shift sequences:
1497 // ~} shift into 1 byte mode,
1498 // ~{ shift into 2 byte GB 2312-80
1499 // ~<NL> Maintain 2 byte mode across new lines (ignore both ~ and <NL> characters)
1500 // (This is for mailers that restrict to 70 or 80 or whatever character lines)
1502 // According to comment in mlang, lead & trail byte ranges are described in RFC 1843
1503 // RFC 1843 => valid HZ code range: leading byte 0x21 - 0x77, 2nd byte 0x21 - 0x7e
1504 // Our 936 code points are or'd with 0x8080, so lead byte 0xa1 - 0xf7, trail byte 0xa1 - 0xfe
1506 // This encoding is designed for transmission by e-mail and news. No bytes should have high bit set.
1507 // (all bytes <= 0x7f)
//
// Decodes code page 52936 (HZ) bytes into chars.
//   bytes, byteCount - input bytes, wrapped by the EncodingCharBuffer created below.
//   chars, charCount - output buffer; the state-saving code at the end treats
//                      chars == null as "counting" and leaves the decoder untouched.
//   decoder          - may be null; when present it supplies the starting mode and at most
//                      one left-over byte, and receives the final state.
// Returns buffer.Count, the number of characters found.
1508 [System.Security.SecurityCritical] // auto-generated
1509 private unsafe int GetCharsCP52936(byte* bytes, int byteCount,
1510 char* chars, int charCount, ISO2022Decoder decoder)
1512 Contract.Assert(byteCount >=0, "[ISO2022Encoding.GetCharsCP52936]count >=0");
1513 Contract.Assert(bytes!=null, "[ISO2022Encoding.GetCharsCP52936]bytes!=null");
1516 Encoding.EncodingCharBuffer buffer = new Encoding.EncodingCharBuffer(
1517 this, decoder, chars, charCount, bytes, byteCount);
1519 // No mode information yet
1520 ISO2022Modes currentMode = ISO2022Modes.ModeASCII;
// -1 means no byte was carried over; any value >= 0 is the carried-over byte itself
1521 int byteLeftOver = -1;
// Set when this call stores a new left-over byte in the decoder
1522 bool bUsedDecoder = false;
1524 if (decoder != null)
1526 currentMode = decoder.currentMode;
1527 // See if we have leftover decoder buffer to use
1528 // Don't want to mess up decoder if we're counting or throw an exception
1529 if (decoder.bytesLeftOverCount != 0 )
1531 // Load our bytesLeftOver
1532 byteLeftOver = decoder.bytesLeftOver[0];
1536 // Do this until the end, just do '?' replacement because we don't have fallbacks for decodings.
1537 while (buffer.MoreData || byteLeftOver >= 0)
1541 // May have a left over byte
1542 if (byteLeftOver >= 0)
1544 ch = (byte)byteLeftOver;
1549 ch = buffer.GetNextByte();
1552 // We're in escape mode
1555 // Next char is type of switch
1556 if (!buffer.MoreData)
1558 // We don't have anything left, it'll be in decoder or a ?
1559 // don't fail if we are allowing overflows
1560 if (decoder == null || decoder.MustFlush)
1563 buffer.Fallback(ch);
1564 // break if we fail & break if we don't (because !MoreData)
1565 // Add succeeded, continue
1569 // Stick it in decoder
1570 if (decoder != null)
1571 decoder.ClearMustFlush();
// Remember the lone '~' so the shift sequence can be completed later
1575 decoder.bytesLeftOverCount = 1;
1576 decoder.bytesLeftOver[0] = (byte)'~';
1577 bUsedDecoder = true;
1582 // What type is it?, get 2nd byte
1583 ch = buffer.GetNextByte();
1585 if (ch == '~' && currentMode == ISO2022Modes.ModeASCII)
1587 // It's just a ~~ replacement for ~, add it
1588 if (!buffer.AddChar((char)ch, 2))
1589 // Add failed, break for converting
1592 // Add succeeded, continue
1597 // Switching to Double Byte mode
1598 currentMode = ISO2022Modes.ModeHZ;
1603 // Switching to ASCII mode
1604 currentMode = ISO2022Modes.ModeASCII;
1607 else if (ch == '\n')
1609 // Ignore ~\n sequence
1614 // Unknown escape, back up and try the '~' as a "normal" byte or lead byte
1615 buffer.AdjustBytes(-1);
1620 // go ahead and add our data
1621 if (currentMode != ISO2022Modes.ModeASCII)
1624 Contract.Assert(currentMode == ISO2022Modes.ModeHZ, "[ISO2022Encoding.GetCharsCP52936]Expected ModeHZ");
1627 // Everett allowed characters < 0x20 to be passed as if they were ASCII
1634 // It's multibyte, should have another byte
1635 if (!buffer.MoreData)
1638 // don't fail if we are allowing overflows
1639 if (decoder == null || decoder.MustFlush)
1641 // Not enough bytes, fallback lead byte
1642 buffer.Fallback(ch);
1644 // Break if we fail & break because !MoreData
1648 if (decoder != null)
1649 decoder.ClearMustFlush();
1651 // Stick it in decoder
1654 decoder.bytesLeftOverCount = 1;
1655 decoder.bytesLeftOver[0] = ch;
1656 bUsedDecoder = true;
1661 // Everett uses space as an escape character for single SBCS bytes
1662 byte ch2 = buffer.GetNextByte();
// Lead byte in the high 8 bits, trail byte in the low 8 bits
1663 ushort iBytes = (ushort)(ch << 8 | ch2);
1665 if (ch == ' ' && ch2 != 0)
1667 // Get next char and treat it like ASCII (Everett treated space like an escape
1668 // allowing the next char to be just ascii)
1670 goto STOREMULTIBYTE;
1673 // Bytes should be in range: lead byte 0x21-0x77, trail byte: 0x21 - 0x7e
1674 if ((ch < 0x21 || ch > 0x77 || ch2 < 0x21 || ch2 > 0x7e) &&
1675 // Everett allowed high bit mappings for same characters (but only if both bits set)
1676 (ch < 0xa1 || ch > 0xf7 || ch2 < 0xa1 || ch2 > 0xfe))
1678 // For some reason Everett allowed XX20 to become unicode 3000... (ideo sp)
1679 if (ch2 == 0x20 && 0x21 <= ch && ch <= 0x7d)
1685 // Illegal char, use fallback. If lead byte is 0 have to do it special and do it first
1686 if (!buffer.Fallback((byte)(iBytes>>8), (byte)(iBytes)))
1693 // Look up the multibyte char to stick it in our data
1695 // We have a iBytes to try to convert.
1696 cm = mapBytesToUnicode[iBytes];
1700 // See if it was unknown
1701 if (cm == UNKNOWN_CHAR_FLAG && iBytes != 0)
1703 // Fall back the unknown stuff
1704 if (!buffer.Fallback((byte)(iBytes>>8), (byte)(iBytes)))
1709 if (!buffer.AddChar(cm, 2))
1710 break; // convert ran out of buffer, stop
1715 // We allow some chars > 7f because everett did, so we have to look them up.
1717 char c = mapBytesToUnicode[ch];
1719 // Check if it was unknown
1720 if ((c == UNKNOWN_CHAR_FLAG || c == 0) && (ch != 0))
1722 // fallback the unknown bytes
1723 if (!buffer.Fallback((byte)ch))
1728 // Go ahead and add our ASCII character
1729 if (!buffer.AddChar(c))
1730 break; // convert ran out of buffer, stop
1733 // Need to remember our state, IF we're not counting
1734 if (chars != null && decoder != null)
1738 // If we didn't use it, clear the byte left over
1739 decoder.bytesLeftOverCount = 0;
// A flush that leaves no byte behind puts the decoder back in ASCII mode
1742 if (decoder.MustFlush && decoder.bytesLeftOverCount == 0)
1744 decoder.currentMode = ISO2022Modes.ModeASCII;
1748 // Either not flushing or had state (from convert)
1749 Contract.Assert(!decoder.MustFlush || !decoder.m_throwOnOverflow,
1750 "[ISO2022Encoding.GetCharsCP52936]Expected no state or not converting or not flushing");
1752 decoder.currentMode = currentMode;
1754 decoder.m_bytesUsed = buffer.BytesUsed;
1757 // Return # of characters we found
1758 return buffer.Count;
1761 // Note: These all end up with 1/2 bytes of average byte count, so unless we're 1 we're always
1762 // charCount/2 bytes too big.
//
// Returns an upper bound on the number of bytes needed to encode charCount characters.
// The bound is (charCount + 1), multiplied by EncoderFallback.MaxCharCount when that is
// greater than 1, multiplied by perChar, plus extraStart + extraEnd.
// Throws ArgumentOutOfRangeException for "charCount" with the NeedNonNegNum message
// (presumably when charCount is negative - the guarding condition should be confirmed),
// and with the GetByteCountOverflow message when the bound exceeds 0x7fffffff.
1763 public override int GetMaxByteCount(int charCount)
1766 throw new ArgumentOutOfRangeException("charCount",
1767 Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
1768 Contract.EndContractBlock();
1770 // Characters would be # of characters + 1 in case high surrogate is ? * max fallback
// Computed as a long so the multiplications below cannot wrap before the range check
1771 long byteCount = (long)charCount + 1;
1773 if (EncoderFallback.MaxCharCount > 1)
1774 byteCount *= EncoderFallback.MaxCharCount;
1776 // Start with just generic DBCS values (sort of).
// NOTE(review): each perChar/extraStart/extraEnd group below is presumably selected per
// code page - confirm which code page maps to which group.
1785 // 2 bytes per char + 3 bytes switch to JIS 0208 or 1 byte + 3 bytes switch to 1 byte CP
1786 perChar = 5; // 5 max (4.5 average)
1787 extraEnd = 3; // 3 bytes to shift back to ASCII
1790 // 2 bytes per char + 3 bytes switch to JIS 0208 or 1 byte + 3 bytes switch to 1 byte CP
1791 perChar = 5; // 5 max (4.5 average)
1792 extraEnd = 4; // 1 byte to shift from Katakana -> DBCS, 3 bytes to shift back to ASCII from DBCS
1795 // 2 bytes per char + 1 byte SO, or 1 byte per char + 1 byte SI.
1796 perChar = 3; // 3 max, (2.5 average)
1797 extraStart = 4; // EUC-KR marker appears at beginning of file.
1798 extraEnd = 1; // 1 byte to shift back to ascii if necessary.
1801 // 2 bytes per char + 2 byte shift, or 1 byte + 1 byte shift
1802 // Worst case: left over surrogate with no low surrogate is extra ?, could have to switch to ASCII, then could have HZ and flush to ASCII mode
1803 perChar = 4; // 4 max, (3.5 average if every other char is HZ/ASCII)
1804 extraEnd = 2; // 2 if we have to shift back to ASCII
1808 // Return our surrogate and End plus perChar for each char.
1809 byteCount *= perChar;
1810 byteCount += extraStart + extraEnd;
// The result has to fit in an int
1812 if (byteCount > 0x7fffffff)
1813 throw new ArgumentOutOfRangeException("charCount", Environment.GetResourceString("ArgumentOutOfRange_GetByteCountOverflow"));
1815 return (int)byteCount;
// Returns an upper bound on the number of characters produced by decoding byteCount bytes.
// The bound is (byteCount * perChar + extraDecoder), multiplied by
// DecoderFallback.MaxCharCount when that is greater than 1.
// Throws ArgumentOutOfRangeException for "byteCount" with the NeedNonNegNum message
// (presumably when byteCount is negative - the guarding condition should be confirmed),
// and with the GetCharCountOverflow message when the bound exceeds 0x7fffffff.
1818 public override int GetMaxCharCount(int byteCount)
1821 throw new ArgumentOutOfRangeException("byteCount",
1822 Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
1823 Contract.EndContractBlock();
// Extra characters the decoder's left-over state could still turn into output
1826 int extraDecoder = 1;
// NOTE(review): the two groups below are presumably selected per code page - confirm.
1834 perChar = 1; // Worst case all ASCII
1835 extraDecoder = 3; // Could have left over 3 chars of 4 char escape sequence, that all become ?
1838 perChar = 1; // Worst case all ASCII
1839 extraDecoder = 1; // sequences are 2 chars, so if next one is illegal, then previous 1 could be ?
1843 // Figure out our length, perchar * char + whatever extra our decoder could do to us.
// Computed as a long so the range check below can detect values that do not fit in an int
1844 long charCount = ((long)byteCount * perChar) + extraDecoder;
1846 // Just in case we have to fall back unknown ones.
1847 if (DecoderFallback.MaxCharCount > 1)
1848 charCount *= DecoderFallback.MaxCharCount;
1850 if (charCount > 0x7fffffff)
1851 throw new ArgumentOutOfRangeException("byteCount", Environment.GetResourceString("ArgumentOutOfRange_GetCharCountOverflow"));
1853 return (int)charCount;
// Creates a new ISO2022Encoder bound to this encoding.
public override Encoder GetEncoder()
{
    Encoder encoder = new ISO2022Encoder(this);
    return encoder;
}
// Creates a new ISO2022Decoder bound to this encoding.
public override Decoder GetDecoder()
{
    Decoder decoder = new ISO2022Decoder(this);
    return decoder;
}
// Encoder for the ISO 2022 encodings. On top of the base EncoderNLS state it carries
// the current mode and the shift in/out mode.
internal class ISO2022Encoder : System.Text.EncoderNLS
{
    // Mode the encoder is currently in
    internal ISO2022Modes currentMode;

    // Mode associated with shift in/out handling
    internal ISO2022Modes shiftInOutMode;

    // NOTE(review): nothing is initialized here; presumably the base constructor
    // invokes Reset() - confirm.
    internal ISO2022Encoder(EncodingNLS encoding) : base(encoding)
    {
    }

    // Puts the encoder back into ASCII mode with no pending character and resets
    // the fallback buffer if there is one.
    public override void Reset()
    {
        charLeftOver = (char)0;
        shiftInOutMode = ISO2022Modes.ModeASCII;
        currentMode = ISO2022Modes.ModeASCII;

        if (m_fallbackBuffer != null)
        {
            m_fallbackBuffer.Reset();
        }
    }

    // True when a character is left over or we are not in ASCII mode.
    internal override bool HasState
    {
        get
        {
            // shiftInOutMode is deliberately not checked, it may be ascii (JP) or not (KR)
            if (currentMode != ISO2022Modes.ModeASCII)
            {
                return true;
            }

            return this.charLeftOver != (char)0;
        }
    }
}
// Decoder for the ISO 2022 encodings. On top of the base DecoderNLS state it carries
// up to 4 left-over bytes plus the current mode and the shift in/out mode.
internal class ISO2022Decoder : System.Text.DecoderNLS
{
    // Bytes carried over between calls, and how many of them are in use
    internal byte[] bytesLeftOver;
    internal int bytesLeftOverCount;

    // Mode the decoder is currently in
    internal ISO2022Modes currentMode;

    // Mode associated with shift in/out handling
    internal ISO2022Modes shiftInOutMode;

    // NOTE(review): nothing is initialized here; presumably the base constructor
    // invokes Reset() - confirm.
    internal ISO2022Decoder(EncodingNLS encoding) : base(encoding)
    {
    }

    // Drops any left-over bytes (allocating a fresh 4 byte buffer), returns to ASCII
    // mode and resets the fallback buffer if there is one.
    public override void Reset()
    {
        bytesLeftOver = new byte[4];
        bytesLeftOverCount = 0;
        shiftInOutMode = ISO2022Modes.ModeASCII;
        currentMode = ISO2022Modes.ModeASCII;

        if (m_fallbackBuffer != null)
        {
            m_fallbackBuffer.Reset();
        }
    }

    // True when bytes are left over or we have not shifted back to ASCII.
    internal override bool HasState
    {
        get
        {
            bool isClean = this.bytesLeftOverCount == 0 &&
                           currentMode == ISO2022Modes.ModeASCII;
            return !isClean;
        }
    }
}
// Lookup table from halfwidth katakana codes to their full width counterparts.
// Entry i corresponds to the halfwidth code 0x8ea1 + i (0x8ea1 through 0x8edf, 63 entries);
// the per-entry comments give the source code and the character name.
// The field is readonly so the table reference cannot be replaced after initialization.
private static readonly ushort[] HalfToFullWidthKanaTable =
{
    0xa1a3, // 0x8ea1 : Halfwidth Ideographic Period
    0xa1d6, // 0x8ea2 : Halfwidth Opening Corner Bracket
    0xa1d7, // 0x8ea3 : Halfwidth Closing Corner Bracket
    0xa1a2, // 0x8ea4 : Halfwidth Ideographic Comma
    0xa1a6, // 0x8ea5 : Halfwidth Katakana Middle Dot
    0xa5f2, // 0x8ea6 : Halfwidth Katakana Wo
    0xa5a1, // 0x8ea7 : Halfwidth Katakana Small A
    0xa5a3, // 0x8ea8 : Halfwidth Katakana Small I
    0xa5a5, // 0x8ea9 : Halfwidth Katakana Small U
    0xa5a7, // 0x8eaa : Halfwidth Katakana Small E
    0xa5a9, // 0x8eab : Halfwidth Katakana Small O
    0xa5e3, // 0x8eac : Halfwidth Katakana Small Ya
    0xa5e5, // 0x8ead : Halfwidth Katakana Small Yu
    0xa5e7, // 0x8eae : Halfwidth Katakana Small Yo
    0xa5c3, // 0x8eaf : Halfwidth Katakana Small Tu
    0xa1bc, // 0x8eb0 : Halfwidth Katakana-Hiragana Prolonged Sound Mark
    0xa5a2, // 0x8eb1 : Halfwidth Katakana A
    0xa5a4, // 0x8eb2 : Halfwidth Katakana I
    0xa5a6, // 0x8eb3 : Halfwidth Katakana U
    0xa5a8, // 0x8eb4 : Halfwidth Katakana E
    0xa5aa, // 0x8eb5 : Halfwidth Katakana O
    0xa5ab, // 0x8eb6 : Halfwidth Katakana Ka
    0xa5ad, // 0x8eb7 : Halfwidth Katakana Ki
    0xa5af, // 0x8eb8 : Halfwidth Katakana Ku
    0xa5b1, // 0x8eb9 : Halfwidth Katakana Ke
    0xa5b3, // 0x8eba : Halfwidth Katakana Ko
    0xa5b5, // 0x8ebb : Halfwidth Katakana Sa
    0xa5b7, // 0x8ebc : Halfwidth Katakana Si
    0xa5b9, // 0x8ebd : Halfwidth Katakana Su
    0xa5bb, // 0x8ebe : Halfwidth Katakana Se
    0xa5bd, // 0x8ebf : Halfwidth Katakana So
    0xa5bf, // 0x8ec0 : Halfwidth Katakana Ta
    0xa5c1, // 0x8ec1 : Halfwidth Katakana Ti
    0xa5c4, // 0x8ec2 : Halfwidth Katakana Tu
    0xa5c6, // 0x8ec3 : Halfwidth Katakana Te
    0xa5c8, // 0x8ec4 : Halfwidth Katakana To
    0xa5ca, // 0x8ec5 : Halfwidth Katakana Na
    0xa5cb, // 0x8ec6 : Halfwidth Katakana Ni
    0xa5cc, // 0x8ec7 : Halfwidth Katakana Nu
    0xa5cd, // 0x8ec8 : Halfwidth Katakana Ne
    0xa5ce, // 0x8ec9 : Halfwidth Katakana No
    0xa5cf, // 0x8eca : Halfwidth Katakana Ha
    0xa5d2, // 0x8ecb : Halfwidth Katakana Hi
    0xa5d5, // 0x8ecc : Halfwidth Katakana Hu
    0xa5d8, // 0x8ecd : Halfwidth Katakana He
    0xa5db, // 0x8ece : Halfwidth Katakana Ho
    0xa5de, // 0x8ecf : Halfwidth Katakana Ma
    0xa5df, // 0x8ed0 : Halfwidth Katakana Mi
    0xa5e0, // 0x8ed1 : Halfwidth Katakana Mu
    0xa5e1, // 0x8ed2 : Halfwidth Katakana Me
    0xa5e2, // 0x8ed3 : Halfwidth Katakana Mo
    0xa5e4, // 0x8ed4 : Halfwidth Katakana Ya
    0xa5e6, // 0x8ed5 : Halfwidth Katakana Yu
    0xa5e8, // 0x8ed6 : Halfwidth Katakana Yo
    0xa5e9, // 0x8ed7 : Halfwidth Katakana Ra
    0xa5ea, // 0x8ed8 : Halfwidth Katakana Ri
    0xa5eb, // 0x8ed9 : Halfwidth Katakana Ru
    0xa5ec, // 0x8eda : Halfwidth Katakana Re
    0xa5ed, // 0x8edb : Halfwidth Katakana Ro
    0xa5ef, // 0x8edc : Halfwidth Katakana Wa
    0xa5f3, // 0x8edd : Halfwidth Katakana N
    0xa1ab, // 0x8ede : Halfwidth Katakana Voiced Sound Mark
    0xa1ac  // 0x8edf : Halfwidth Katakana Semi-Voiced Sound Mark
};
2003 #endif // FEATURE_CODEPAGES_FILE