* roottypes.cs: Rename from tree.cs.
[mono.git] / mcs / class / I18N / CJK / CP51932.cs
1 /*
2  * CP51932.cs - Japanese EUC-JP code page.
3  *
4  * It is based on CP932.cs from Portable.NET
5  *
6  * Author:
7  *      Atsushi Enomoto <atsushi@ximian.com>
8  *
9  * Below are original (CP932.cs) copyright lines
10  *
11  * (C)2004 Novell Inc.
12  *
13  * Copyright (c) 2002  Southern Storm Software, Pty Ltd
14  *
15  * Permission is hereby granted, free of charge, to any person obtaining
16  * a copy of this software and associated documentation files (the "Software"),
17  * to deal in the Software without restriction, including without limitation
18  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
19  * and/or sell copies of the Software, and to permit persons to whom the
20  * Software is furnished to do so, subject to the following conditions:
21  *
22  * The above copyright notice and this permission notice shall be included
23  * in all copies or substantial portions of the Software.
24  *
25  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
26  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
27  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
28  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
29  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
30  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
31  * OTHER DEALINGS IN THE SOFTWARE.
32  */
33
/*

        No source for jis.table appears to be available. It seems to have
        been generated from the mapping text files on the Unicode home page,
        such as
        ftp://ftp.unicode.org/Public/MAPPINGS/OBSOLETE/EASTASIA/JIS/JIS0208.TXT
        However, that mapping is non-normative, and in Japan it is known to
        contain many problems.

        FIXME:  Some characters, such as 0xFF0B (wide "plus"), are missing
                from that table.
*/
44
/*
        0x00-0x1F, 0x7F   : control characters
        0x20-0x7E         : ASCII
        0xA1A1-0xFEFE     : Kanji (more precisely, each of the two bytes is
                            in the range A1-FE)
        0x8EA1-0x8EDF     : half-width Katakana
        0x8FA1A1-0x8FFEFE : supplementary Kanji (three-byte sequences)

*/
53
54 namespace I18N.CJK
55 {
56
57 using System;
58 using System.Text;
59 using I18N.Common;
60
// Encoding class for the Japanese EUC-JP code page (Windows code page 51932).
//
// The one-shot conversion methods create a fresh CP51932Encoder or
// CP51932Decoder for every call; GetEncoder and GetDecoder hand out
// instances that the caller can keep and reuse.
[Serializable]
public class CP51932 : MonoEncoding
{
	// Magic number used by Windows for the EUC-JP code page.
	private const int EUC_JP_CODE_PAGE = 51932;

	// Constructor.
	// NOTE(review): the second base argument (932) is presumably the
	// associated Windows code page - confirm against MonoEncoding.
	public CP51932 () : base (EUC_JP_CODE_PAGE, 932)
	{
	}

	// Get the number of bytes needed to encode a range of a character
	// array.  A fresh encoder is used and asked to flush.
	public override int GetByteCount (char [] chars, int index, int length)
	{
		return new CP51932Encoder (this).GetByteCount (chars, index, length, true);
	}

	// Pointer-based variant of GetByteCount; delegates to a fresh encoder.
	public unsafe override int GetByteCountImpl (char* chars, int count)
	{
		return new CP51932Encoder (this).GetByteCountImpl (chars, count, true);
	}

	// Encode charCount characters into the byte buffer and return the
	// number of bytes written; delegates to a fresh encoder.
	public unsafe override int GetBytesImpl (char* chars, int charCount, byte* bytes, int byteCount)
	{
		return new CP51932Encoder (this).GetBytesImpl (chars, charCount, bytes, byteCount, true);
	}

	// Get the number of characters needed to decode a byte array range.
	// With NET_2_0 the decoder is asked to flush, so an incomplete
	// trailing sequence is counted as one replacement character;
	// otherwise the non-flushing overload is used.
	public override int GetCharCount (byte [] bytes, int index, int count)
	{
#if NET_2_0
		return new CP51932Decoder ().GetCharCount (
			bytes, index, count, true);
#else
		return new CP51932Decoder ().GetCharCount (
			bytes, index, count);
#endif
	}

	// Decode a byte array range into chars starting at charIndex and
	// return the number of characters written.  The flush behaviour
	// matches GetCharCount above.
	public override int GetChars (
		byte [] bytes, int byteIndex, int byteCount,
		char [] chars, int charIndex)
	{
#if NET_2_0
		return new CP51932Decoder ().GetChars (bytes,
			byteIndex, byteCount, chars, charIndex, true);
#else
		return new CP51932Decoder ().GetChars (bytes,
			byteIndex, byteCount, chars, charIndex);
#endif
	}

	// Get the maximum number of bytes needed to encode a
	// specified number of characters.
	public override int GetMaxByteCount(int charCount)
	{
		if(charCount < 0)
		{
			throw new ArgumentOutOfRangeException
				("charCount",
				 Strings.GetString("ArgRange_NonNegative"));
		}
		// The longest EUC-JP sequence is three bytes.
		return charCount * 3;
	}

	// Get the maximum number of characters needed to decode a
	// specified number of bytes.
	public override int GetMaxCharCount(int byteCount)
	{
		if(byteCount < 0)
		{
			throw new ArgumentOutOfRangeException
				("byteCount",
				 Strings.GetString ("ArgRange_NonNegative"));
		}
		// Every byte yields at most one character.
		return byteCount;
	}

	// Create an encoder bound to this encoding.
	public override Encoder GetEncoder ()
	{
		return new CP51932Encoder (this);
	}

	// Create a decoder that keeps partial-sequence state between calls.
	public override Decoder GetDecoder ()
	{
		return new CP51932Decoder ();
	}

#if !ECMA_COMPAT

	// Get the mail body name for this encoding.
	public override String BodyName {
		get { return "euc-jp"; }
	}

	// Get the human-readable name for this encoding.
	public override String EncodingName {
		get { return "Japanese (EUC)"; }
	}

	// Get the mail agent header name for this encoding.
	public override String HeaderName {
		get { return "euc-jp"; }
	}

	// Determine if this encoding can be displayed in a Web browser.
	public override bool IsBrowserDisplay {
		get { return true; }
	}

	// Determine if this encoding can be saved from a Web browser.
	public override bool IsBrowserSave {
		get { return true; }
	}

	// Determine if this encoding can be displayed in a mail/news agent.
	public override bool IsMailNewsDisplay {
		get { return true; }
	}

	// Determine if this encoding can be saved from a mail/news agent.
	public override bool IsMailNewsSave {
		get { return true; }
	}

	// Get the IANA-preferred Web name for this encoding.
	public override String WebName {
		get { return "euc-jp"; }
	}

#endif // !ECMA_COMPAT

	// The closing brace must stay outside the conditional section above,
	// otherwise the class is left unterminated when ECMA_COMPAT is defined.
} // CP51932
191
// Encoder that converts UTF-16 characters to EUC-JP bytes.
//
// Byte counting and byte generation share one mapping routine (ToJis) so
// that the count reported for a character always matches what is written
// for it.
public class CP51932Encoder : MonoEncoder
{
	public CP51932Encoder (MonoEncoding encoding)
		: base (encoding)
	{
	}

	// Read the little-endian 16-bit entry number "entry" from one of the
	// JISConvert tables.
	private static int ReadEntry (byte[] table, int entry)
	{
		int offset = entry * 2;
		return ((int) table [offset]) | (((int) table [offset + 1]) << 8);
	}

	// Map a non-ASCII UTF-16 code unit to the value GetBytesImpl turns
	// into bytes:
	//   0               - no mapping (fallback or '?')
	//   0x0001-0x00FF   - written as that single byte
	//   0x0100-0x7FFF   - 0x0100 + lead * 94 + trail of a two-byte code
	//   0x8000 and up   - written as 0x8E followed by the low byte
	//                     (half-width katakana, 0x8EA1-0x8EDF)
	//
	// NOTE(review): Latin-1 characters (0x0080-0x00FF) have no mapping
	// here, which is what GetBytesImpl has always done.  If best-fit
	// mappings for characters such as 0x00A2 or 0x00A3 are wanted, add
	// them in this method so counting and encoding stay in agreement.
	private static int ToJis (int ch, byte[] cjkToJis,
				  byte[] greekToJis, byte[] extraToJis)
	{
		if (ch >= 0x0391 && ch <= 0x0451) {
			// Greek subset characters.
			return ReadEntry (greekToJis, ch - 0x0391);
		}
		if (ch >= 0x2010 && ch <= 0x9FA5) {
			// This range contains the bulk of the CJK set.
			return ReadEntry (cjkToJis, ch - 0x2010);
		}
		if (ch >= 0xFF01 && ch <= 0xFF60) {
			// Extra (full-width) characters.
			return ReadEntry (extraToJis, ch - 0xFF01);
		}
		if (ch >= 0xFF61 && ch <= 0xFF9F) {
			// Half-width katakana: U+FF61..U+FF9F map onto
			// 0x8EA1..0x8EDF.  U+FF60 and U+FFA0 are not kana and
			// would produce trail bytes outside A1-DF.
			return ch - 0xFF60 + 0x8EA0;
		}
		// Invalid character.
		return 0;
	}

	// Get the number of bytes needed to encode a character buffer.
	//
	// chars   - characters to measure
	// count   - number of characters in the buffer
	// refresh - not used by this implementation
	//
	// Unmappable characters are counted as one byte, the size of the '?'
	// written when no fallback is compiled in.
	// NOTE(review): with NET_2_0 the fallback decides how many bytes are
	// really written for such characters - confirm this estimate is
	// acceptable for non-default fallbacks.
	public unsafe override int GetByteCountImpl (
		char* chars, int count, bool refresh)
	{
		int length = 0;
		byte [] cjkToJis = JISConvert.Convert.cjkToJis;
		byte [] greekToJis = JISConvert.Convert.greekToJis;
		byte [] extraToJis = JISConvert.Convert.extraToJis;

		for (int index = 0; index < count; index++) {
			int ch = chars [index];
			if (ch < 0x0080) {
				// Character maps to itself.
				length++;
				continue;
			}

			int value = ToJis (ch, cjkToJis, greekToJis, extraToJis);
			// Values of 0x0100 and above become two bytes, anything
			// else (including "no mapping") becomes one.
			length += (value >= 0x0100) ? 2 : 1;
		}

		// Return the length to the caller.
		return length;
	}

	// Get the bytes that result from encoding a character buffer.
	//
	// chars     - characters to encode
	// charCount - number of characters to encode
	// bytes     - destination buffer
	// byteCount - capacity of the destination buffer
	// refresh   - not used by this implementation
	//
	// Returns the number of bytes written.  Throws ArgumentException
	// when the destination buffer is too small.
	public unsafe override int GetBytesImpl (
		char* chars, int charCount, byte* bytes, int byteCount, bool refresh)
	{
		int charIndex = 0;
		int posn = 0;
		int byteLength = byteCount;
		int ch, value;

		byte[] cjkToJis = JISConvert.Convert.cjkToJis;
		byte[] greekToJis = JISConvert.Convert.greekToJis;
		byte[] extraToJis = JISConvert.Convert.extraToJis;

		for (; charCount > 0; charIndex++, --charCount) {
			ch = chars [charIndex];
			if (posn >= byteLength) {
				throw new ArgumentException (Strings.GetString ("Arg_InsufficientSpace"), "bytes");
			}

			if (ch < 0x0080) {
				// Character maps to itself.
				bytes [posn++] = (byte) ch;
				continue;
			}

			value = ToJis (ch, cjkToJis, greekToJis, extraToJis);

			if (value == 0) {
#if NET_2_0
				// NOTE(review): byteCount is handed over without
				// being reduced by the bytes already written -
				// confirm whether HandleFallback expects the total
				// or the remaining capacity.
				HandleFallback (
					chars, ref charIndex, ref charCount,
					bytes, ref posn, ref byteCount);
#else
				bytes [posn++] = (byte) '?';
#endif
			} else if (value < 0x0100) {
				bytes [posn++] = (byte) value;
			} else if ((posn + 1) >= byteLength) {
				throw new ArgumentException (Strings.GetString ("Arg_InsufficientSpace"), "bytes");
			} else if (value < 0x8000) {
				// general 2byte glyph/kanji: split the table value
				// into lead and trail bytes, each offset by 0xA1.
				value -= 0x0100;
				bytes [posn++] = (byte) (value / 0x5E + 0xA1);
				bytes [posn++] = (byte) (value % 0x5E + 0xA1);
			} else {
				// half-width kana
				bytes [posn++] = 0x8E;
				bytes [posn++] = (byte) (value - 0x8E00);
			}
		}

		// Return the final length to the caller.
		return posn;
	}
} // CP51932Encoder
333
// Decoder that converts EUC-JP bytes to UTF-16 characters.
//
// A multi-byte sequence may be split across calls, so the part seen so far
// is remembered between calls.  Counting (GetCharCount) and converting
// (GetChars) each keep their own pending state and share one state machine
// (Decode), so both always agree on how the input is segmented.
internal class CP51932Decoder : DbcsEncoding.DbcsDecoder
{
	// Character produced for invalid, unmapped or incomplete sequences.
	private const char REPLACEMENT = '\u30FB';

	// Lead byte of a two-byte half-width katakana sequence.
	private const int KANA_LEAD = 0x8E;

	// Lead byte of a three-byte sequence.
	private const int TRIPLE_LEAD = 0x8F;

	public CP51932Decoder ()
		: base (null)
	{
	}

	// Pending state of the counting and converting paths respectively:
	//   0             - no sequence in progress
	//   0x8E          - katakana lead byte seen
	//   0x8F          - three-byte lead byte seen
	//   0xA1-0xFE     - lead byte of a two-byte sequence seen
	//   0x8F00 | b    - 0x8F and the second byte b of a three-byte
	//                   sequence seen
	int last_count, last_bytes;

	// Get the number of characters needed to decode a byte buffer.
	public override int GetCharCount (byte [] bytes, int index, int count)
	{
		return GetCharCount (bytes, index, count, false);
	}

	// Get the number of characters needed to decode a byte buffer.
	// When refresh is true an incomplete trailing sequence is counted as
	// one replacement character and the pending state is cleared;
	// otherwise the pending state is kept for the next call.
#if NET_2_0
	public override
#else
	internal
#endif
	int GetCharCount (byte [] bytes, int index, int count, bool refresh)
	{
		CheckRange (bytes, index, count);
		return Decode (bytes, index, count, null, 0, false,
			       refresh, ref last_count);
	}

	// Decode bytes into chars, keeping an incomplete trailing sequence
	// pending for the next call.
	public override int GetChars (byte[] bytes, int byteIndex,
				      int byteCount, char[] chars,
				      int charIndex)
	{
		return GetChars (bytes, byteIndex, byteCount, chars, charIndex, false);
	}

	// Decode bytes into chars starting at charIndex and return the number
	// of characters written.  When refresh is true an incomplete trailing
	// sequence is written as one replacement character and the pending
	// state is cleared.  Throws ArgumentException when chars is too small.
#if NET_2_0
	public override
#else
	internal
#endif
	int GetChars (byte[] bytes, int byteIndex,
		      int byteCount, char[] chars,
		      int charIndex, bool refresh)
	{
		CheckRange (bytes, byteIndex, byteCount, chars, charIndex);
		return Decode (bytes, byteIndex, byteCount, chars, charIndex,
			       true, refresh, ref last_bytes);
	}

	// Shared state machine.  Walks the bytes, produces one character per
	// complete, invalid or (when flushing) incomplete sequence, and
	// returns the number of characters produced.  Characters are written
	// to chars only when store is true; when it is false the tables are
	// not consulted because every sequence yields exactly one character
	// regardless of its mapping.
	int Decode (byte[] bytes, int byteIndex, int byteCount,
		    char[] chars, int charIndex, bool store, bool refresh,
		    ref int state)
	{
		int posn = charIndex;
		int charLength = store ? chars.Length : 0;
		int pending = state;
		byte[] table0208 = null;
		byte[] table0212 = null;
		if (store) {
			table0208 = JISConvert.Convert.jisx0208ToUnicode;
			table0212 = JISConvert.Convert.jisx0212ToUnicode;
		}

		while (byteCount > 0) {
			int byteval = bytes [byteIndex++];
			--byteCount;
			char ch;

			if (pending == 0) {
				if (byteval <= 0x7F) {
					// Ordinary ASCII/Control character.
					ch = (char) byteval;
				} else if (byteval == KANA_LEAD ||
					   byteval == TRIPLE_LEAD ||
					   (byteval >= 0xA1 && byteval <= 0xFE)) {
					// First byte of a multi-byte sequence.
					pending = byteval;
					continue;
				} else {
					// Invalid first byte.
					ch = REPLACEMENT;
				}
			} else if (pending == KANA_LEAD) {
				// Second byte of half-width katakana:
				// 0xA1..0xDF map onto U+FF61..U+FF9F.
				pending = 0;
				if (byteval >= 0xA1 && byteval <= 0xDF)
					ch = (char) (0xFF00 | (byteval - 0x40));
				else
					ch = REPLACEMENT;
			} else if (pending == TRIPLE_LEAD) {
				// Second byte of a three-byte sequence.
				if (byteval >= 0xA1 && byteval <= 0xFE) {
					pending = (TRIPLE_LEAD << 8) | byteval;
					continue;
				}
				pending = 0;
				ch = REPLACEMENT;
			} else {
				// Final byte of a two-byte or three-byte
				// sequence; the preceding byte is in the low
				// eight bits of the pending state.
				byte[] table = (pending > 0xFF) ? table0212 : table0208;
				int lead = pending & 0xFF;
				pending = 0;
				if (byteval < 0xA1 || byteval > 0xFE) {
					// Invalid trailing byte.
					ch = REPLACEMENT;
				} else if (store) {
					ch = Lookup (table, lead, byteval);
				} else {
					// Counting only; the value is not used.
					ch = REPLACEMENT;
				}
			}

			if (store) {
				if (posn >= charLength)
					throw Insufficient ();
				chars [posn] = ch;
			}
			posn++;
		}

		if (refresh) {
			// seems like .NET 2.0 adds \u30FB for insufficient
			// byte sequence (for Japanese \u30FB makes sense).
			if (pending != 0) {
				if (store) {
					if (posn >= charLength)
						throw Insufficient ();
					chars [posn] = REPLACEMENT;
				}
				posn++;
			}
			// Flushing always leaves the decoder idle, so a reused
			// decoder does not carry stale bytes into the next call.
			state = 0;
		} else {
			state = pending;
		}

		// Return the final length to the caller.
		return posn - charIndex;
	}

	// Look up the character for a lead/trail byte pair (each A1-FE) in a
	// table of little-endian 16-bit entries ordered by
	// (lead - 0xA1) * 94 + (trail - 0xA1).  Unmapped or out-of-range
	// entries yield the replacement character.
	// NOTE(review): jisx0212ToUnicode is assumed to use the same layout
	// as jisx0208ToUnicode - confirm against JISConvert.
	static char Lookup (byte[] table, int lead, int trail)
	{
		int offset = ((lead - 0xA1) * 0x5E + (trail - 0xA1)) * 2;
		if (offset + 1 >= table.Length)
			return REPLACEMENT;
		int value = ((int) table [offset])
			| (((int) table [offset + 1]) << 8);
		return (value != 0) ? (char) value : REPLACEMENT;
	}

	// Build the exception thrown when the output buffer is too small.
	static Exception Insufficient ()
	{
		return new ArgumentException
			(Strings.GetString
				("Arg_InsufficientSpace"), "chars");
	}
} // class CP51932Decoder
584
// The same encoding as CP51932 under a class name derived from its web name.
// NOTE(review): presumably located by name by the I18N loader - confirm.
[Serializable]
public class ENCeuc_jp : CP51932
{
	// Nothing to configure; the CP51932 constructor does all the work.
	public ENCeuc_jp ()
	{
	}
} // class ENCeuc_jp
591
592 }; // namespace I18N.CJK