Merge pull request #214 from QuickJack/cd2c570c5543963d987f51080218715407c5d4b9
[mono.git] / mcs / class / I18N / CJK / GB18030Encoding.cs
1 //
2 // GB18030Encoding.cs
3 //
4 // Author:
5 //      Atsushi Enomoto  <atsushi@ximian.com>
6 //
7 using System;
8 using System.Reflection;
9 using System.Text;
10 using I18N.Common;
11
12 #if DISABLE_UNSAFE
13 using MonoEncoder = I18N.Common.MonoSafeEncoder;
14 using MonoEncoding = I18N.Common.MonoSafeEncoding;
15 #endif
16
17 namespace I18N.CJK
18 {
19         [Serializable]
20         internal class ENCgb18030 : GB18030Encoding
21         {
22                 public ENCgb18030 (): base () {}
23         }
24
25         [Serializable]
26         public class CP54936 : GB18030Encoding { }
27
28         [Serializable]
29         public class GB18030Encoding : MonoEncoding
30         {
31                 // Constructor.
32                 public GB18030Encoding ()
33                         : base (54936, 936)
34                 {
35                 }
36
37                 public override string EncodingName {
38                         get { return "Chinese Simplified (GB18030)"; }
39                 }
40
41                 public override string HeaderName {
42                         get { return "GB18030"; }
43                 }
44
45                 public override string BodyName {
46                         get { return "GB18030"; }
47                 }
48
49                 public override string WebName {
50                         get { return "GB18030"; }
51                 }
52
53                 public override bool IsMailNewsDisplay {
54                         get { return true; }
55                 }
56
57                 public override bool IsMailNewsSave {
58                         get { return true; }
59                 }
60
61                 public override bool IsBrowserDisplay {
62                         get { return true; }
63                 }
64
65                 public override bool IsBrowserSave {
66                         get { return true; }
67                 }
68
69                 public override int GetMaxByteCount (int len)
70                 {
71                         // non-GB2312 characters in \u0080 - \uFFFF
72                         return len * 4;
73                 }
74
75                 public override int GetMaxCharCount (int len)
76                 {
77                         return len;
78                 }
79
80 #if !DISABLE_UNSAFE
81                 public unsafe override int GetByteCountImpl (char* chars, int count)
82                 {
83                         return new GB18030Encoder (this).GetByteCountImpl (chars, count, true);
84                 }
85
86                 public unsafe override int GetBytesImpl (char* chars, int charCount, byte* bytes, int byteCount)
87                 {
88                         return new GB18030Encoder (this).GetBytesImpl (chars, charCount, bytes, byteCount, true);
89                 }
90 #else
91                 public override int GetByteCount (char [] chars, int index, int length)
92                 {
93                         return new GB18030Encoder (this).GetByteCount (chars, index, length, true);
94                 }
95
96                 public override int GetBytes(char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex)
97                 {
98                         return new GB18030Encoder (this).GetBytes (chars, charIndex, charCount, bytes, byteIndex, true);
99                 }
100 #endif
101
102                 public override int GetCharCount (byte [] bytes, int start, int len)
103                 {
104                         return new GB18030Decoder ().GetCharCount (bytes, start, len);
105                 }
106
107                 public override int GetChars (byte [] bytes, int byteIdx, int srclen, char [] chars, int charIdx)
108                 {
109                         return new GB18030Decoder ().GetChars (bytes, byteIdx, srclen, chars, charIdx);
110                 }
111
112                 public override Encoder GetEncoder ()
113                 {
114                         return new GB18030Encoder (this);
115                 }
116
117                 public override Decoder GetDecoder ()
118                 {
119                         return new GB18030Decoder ();
120                 }
121         }
122
123         class GB18030Decoder : DbcsEncoding.DbcsDecoder
124         {
125                 static DbcsConvert gb2312 = DbcsConvert.Gb2312;
126                 // for now incomplete block is not supported - should we?
127                 // int incomplete1 = -1, incomplete2 = -1, incomplete3 = -1;
128
129                 public GB18030Decoder ()
130                         : base (null)
131                 {
132                 }
133
134                 public override int GetCharCount (byte [] bytes, int start, int len)
135                 {
136                         CheckRange (bytes, start, len);
137
138                         int end = start + len;
139                         int ret = 0;
140                         while (start < end) {
141                                 if (bytes [start] < 0x80) {
142                                         ret++;
143                                         start++;
144                                         continue;
145                                 }
146                                 else if (bytes [start] == 0x80) {
147                                         // Euro sign - actually it is obsolete,
148                                         // now it's just reserved but not used
149                                         ret++;
150                                         start++;
151                                         continue;
152                                 }
153                                 else if (bytes [start] == 0xFF) {
154                                         // invalid data - fill '?'
155                                         ret++;
156                                         start++;
157                                         continue;
158                                 }
159                                 else if (start + 1 >= end) {
160 //                                      incomplete1 = bytes [start];
161 //                                      incomplete2 = -1;
162 //                                      incomplete3 = -1;
163                                         ret++;
164                                         break; // incomplete tail.
165                                 }
166
167                                 byte second = bytes [start + 1];
168                                 if (second == 0x7F || second == 0xFF) {
169                                         // invalid data
170                                         ret++;
171                                         start += 2;
172                                         continue;
173                                 }
174                                 else if (0x30 <= second && second <= 0x39) {
175                                         // UCS mapping
176                                         if (start + 3 >= end) {
177                                                 // incomplete tail.
178 //                                              incomplete1 = bytes [start];
179 //                                              incomplete2 = bytes [start + 1];
180 //                                              if (start + 3 == end)
181 //                                                      incomplete3 = bytes [start + 2];
182                                                 ret += start + 3 == end ? 3 : 2;
183                                                 break;
184                                         }
185                                         long value = GB18030Source.FromGBX (bytes, start);
186                                         if (value < 0) {
187                                                 // invalid data.
188                                                 ret++;
189                                                 start -= (int) value;
190                                         } else if (value >= 0x10000) {
191                                                 // UTF16 surrogate
192                                                 ret += 2;
193                                                 start += 4;
194                                         } else {
195                                                 // UTF16 BMP
196                                                 ret++;
197                                                 start+= 4;
198                                         }
199                                 } else {
200                                         // GB2312 mapping
201                                         start += 2;
202                                         ret++;
203                                 }
204                         }
205                         return ret;
206                 }
207
208                 public override int GetChars (byte [] bytes, int byteIndex, int byteCount, char [] chars, int charIndex)
209                 {
210                         CheckRange (bytes, byteIndex, byteCount, chars, charIndex);
211
212                         int byteEnd = byteIndex + byteCount;
213                         int charStart = charIndex;
214
215                         while (byteIndex < byteEnd) {
216                                 if (bytes [byteIndex] < 0x80) {
217                                         chars [charIndex++] = (char) bytes [byteIndex++];
218                                         continue;
219                                 }
220                                 else if (bytes [byteIndex] == 0x80) {
221                                         // Euro sign - actually it is obsolete,
222                                         // now it's just reserved but not used
223                                         chars [charIndex++] = '\u20AC';
224                                         byteIndex++;
225                                         continue;
226                                 }
227                                 else if (bytes [byteIndex] == 0xFF) {
228                                         // invalid data - fill '?'
229                                         chars [charIndex++] = '?';
230                                         byteIndex++;
231                                         continue;
232                                 }
233                                 else if (byteIndex + 1 >= byteEnd) {
234                                         //incomplete1 = bytes [byteIndex++];
235                                         //incomplete2 = -1;
236                                         //incomplete3 = -1;
237                                         break; // incomplete tail.
238                                 }
239
240                                 byte second = bytes [byteIndex + 1];
241                                 if (second == 0x7F || second == 0xFF) {
242                                         // invalid data
243                                         chars [charIndex++] = '?';
244                                         byteIndex += 2;
245                                 }
246                                 else if (0x30 <= second && second <= 0x39) {
247                                         // UCS mapping
248                                         if (byteIndex + 3 >= byteEnd) {
249                                                 // incomplete tail.
250                                                 //incomplete1 = bytes [byteIndex];
251                                                 //incomplete2 = bytes [byteIndex + 1];
252                                                 //if (byteIndex + 3 == byteEnd)
253                                                 //      incomplete3 = bytes [byteIndex + 2];
254                                                 break;
255                                         }
256                                         long value = GB18030Source.FromGBX (bytes, byteIndex);
257                                         if (value < 0) {
258                                                 // invalid data.
259                                                 chars [charIndex++] = '?';
260                                                 byteIndex -= (int) value;
261                                         } else if (value >= 0x10000) {
262                                                 // UTF16 surrogate
263                                                 value -= 0x10000;
264                                                 chars [charIndex++] = (char) (value / 0x400 + 0xD800);
265                                                 chars [charIndex++] = (char) (value % 0x400 + 0xDC00);
266                                                 byteIndex += 4;
267                                         } else {
268                                                 // UTF16 BMP
269                                                 chars [charIndex++] = (char) value;
270                                                 byteIndex += 4;
271                                         }
272                                 } else {
273                                         byte first = bytes [byteIndex];
274                                         int ord = ((first - 0x81) * 191 + second - 0x40) * 2;
275                                         char c1 = ord < 0 || ord >= gb2312.n2u.Length ?
276                                                 '\0' : (char) (gb2312.n2u [ord] + gb2312.n2u [ord + 1] * 256);
277                                         if (c1 == 0)
278                                                 chars [charIndex++] = '?';
279                                         else
280                                                 chars [charIndex++] = c1;
281                                         byteIndex += 2;
282                                 }
283                         }
284
285                         return charIndex - charStart;
286                 }
287         }
288
289         class GB18030Encoder : MonoEncoder
290         {
291                 static DbcsConvert gb2312 = DbcsConvert.Gb2312;
292
293                 public GB18030Encoder (MonoEncoding owner)
294                         : base (owner)
295                 {
296                 }
297
298                 char incomplete_byte_count;
299                 char incomplete_bytes;
300
301 #if !DISABLE_UNSAFE
302                 public unsafe override int GetByteCountImpl (char* chars, int count, bool refresh)
303                 {
304                         int start = 0;
305                         int end = count;
306                         int ret = 0;
307                         while (start < end) {
308                                 char ch = chars [start];
309                                 if (ch < 0x80) {
310                                         // ASCII
311                                         ret++;
312                                         start++;
313                                         continue;
314                                 } else if (Char.IsSurrogate (ch)) {
315                                         // Surrogate
316                                         if (start + 1 == end) {
317                                                 incomplete_byte_count = ch;
318                                                 start++;
319                                         } else {
320                                                 ret += 4;
321                                                 start += 2;
322                                         }
323                                         continue;
324                                 }
325
326                                 if (ch < 0x80 || ch == 0xFF) {
327                                         // ASCII
328                                         ret++;
329                                         start++;
330                                         continue;
331                                 }
332
333                                 byte b1 = gb2312.u2n [((int) ch) * 2 + 1];
334                                 byte b2 = gb2312.u2n [((int) ch) * 2];
335                                 if (b1 != 0 && b2 != 0) {
336                                         // GB2312
337                                         ret += 2;
338                                         start++;
339                                         continue;
340                                 }
341
342                                 // non-GB2312
343                                 long value = GB18030Source.FromUCS (ch);
344                                 if (value < 0)
345                                         ret++; // invalid(?)
346                                 else
347                                         ret += 4;
348                                 start++;
349                         }
350
351                         if (refresh) {
352                                 if (incomplete_byte_count != char.MinValue)
353                                         ret++;
354                                 incomplete_byte_count = char.MinValue;
355                         }
356                         return ret;
357                 }
358
359                 public unsafe override int GetBytesImpl (char* chars, int charCount, byte* bytes, int byteCount, bool refresh)
360                 {
361                         int charIndex = 0;
362                         int byteIndex = 0;
363
364                         int charEnd = charIndex + charCount;
365                         int byteStart = byteIndex;
366                         char ch = incomplete_bytes;
367
368                         while (charIndex < charEnd) {
369                                 if (incomplete_bytes == char.MinValue)
370                                         ch = chars [charIndex++];
371                                 else
372                                         incomplete_bytes = char.MinValue;
373
374                                 if (ch < 0x80) {
375                                         // ASCII
376                                         bytes [byteIndex++] = (byte) ch;
377                                         continue;
378                                 } else if (Char.IsSurrogate (ch)) {
379                                         // Surrogate
380                                         if (charIndex == charEnd) {
381                                                 incomplete_bytes = ch;
382                                                 break; // incomplete
383                                         }
384                                         char ch2 = chars [charIndex++];
385                                         if (!Char.IsSurrogate (ch2)) {
386                                                 // invalid surrogate
387 #if NET_2_0
388                                                 HandleFallback (
389                                                         chars, ref charIndex, ref charCount,
390                                                         bytes, ref byteIndex, ref byteCount, null);
391 #else
392                                                 bytes [byteIndex++] = (byte) '?';
393 #endif
394                                                 continue;
395                                         }
396                                         int cp = (ch - 0xD800) * 0x400 + ch2 - 0xDC00;
397                                         GB18030Source.Unlinear (bytes + byteIndex, GB18030Source.FromUCSSurrogate (cp));
398                                         byteIndex += 4;
399                                         continue;
400                                 }
401
402
403                                 if (ch <= 0x80 || ch == 0xFF) {
404                                         // Character maps to itself
405                                         bytes [byteIndex++] = (byte) ch;
406                                         continue;
407                                 }
408
409                                 byte b1 = gb2312.u2n [((int) ch) * 2 + 1];
410                                 byte b2 = gb2312.u2n [((int) ch) * 2];
411                                 if (b1 != 0 && b2 != 0) {
412                                         bytes [byteIndex++] = b1;
413                                         bytes [byteIndex++] = b2;
414                                         continue;
415                                 }
416
417                                 long value = GB18030Source.FromUCS (ch);
418                                 if (value < 0)
419                                         bytes [byteIndex++] = 0x3F; // invalid(?)
420                                 else {
421                                         // non-GB2312
422                                         GB18030Source.Unlinear (bytes + byteIndex, value);
423                                         byteIndex += 4;
424                                 }
425                         }
426
427                         if (refresh) {
428                                 if (incomplete_bytes != char.MinValue)
429                                         bytes [byteIndex++] = 0x3F; // incomplete
430                                 incomplete_bytes = char.MinValue;
431                         }
432
433                         return byteIndex - byteStart;
434                 }
435 #else
436
437                 public override int GetByteCount(char[] chars, int index, int count, bool refresh)
438                 {
439                         int start = 0;
440                         int end = count;
441                         int ret = 0;
442                         while (start < end)
443                         {
444                                 char ch = chars[start];
445                                 if (ch < 0x80)
446                                 {
447                                         // ASCII
448                                         ret++;
449                                         start++;
450                                         continue;
451                                 }
452                                 else if (Char.IsSurrogate(ch))
453                                 {
454                                         // Surrogate
455                                         if (start + 1 == end)
456                                         {
457                                                 incomplete_byte_count = ch;
458                                                 start++;
459                                         }
460                                         else
461                                         {
462                                                 ret += 4;
463                                                 start += 2;
464                                         }
465                                         continue;
466                                 }
467
468                                 if (ch < 0x80 || ch == 0xFF)
469                                 {
470                                         // ASCII
471                                         ret++;
472                                         start++;
473                                         continue;
474                                 }
475
476                                 byte b1 = gb2312.u2n[((int)ch) * 2 + 1];
477                                 byte b2 = gb2312.u2n[((int)ch) * 2];
478                                 if (b1 != 0 && b2 != 0)
479                                 {
480                                         // GB2312
481                                         ret += 2;
482                                         start++;
483                                         continue;
484                                 }
485
486                                 // non-GB2312
487                                 long value = GB18030Source.FromUCS(ch);
488                                 if (value < 0)
489                                         ret++; // invalid(?)
490                                 else
491                                         ret += 4;
492                                 start++;
493                         }
494
495                         if (refresh)
496                         {
497                                 if (incomplete_byte_count != char.MinValue)
498                                         ret++;
499                                 incomplete_byte_count = char.MinValue;
500                         }
501                         return ret;
502                 }
503
504                 public override int GetBytes(char[] chars, int charIndex, int charCount, byte[] bytes, int byteIndex, bool refresh)
505                 {
506                         int byteCount = bytes.Length;
507                         int charEnd = charIndex + charCount;
508                         int byteStart = byteIndex;
509                         char ch = incomplete_bytes;
510
511                         while (charIndex < charEnd)
512                         {
513                                 if (incomplete_bytes == char.MinValue)
514                                         ch = chars[charIndex++];
515                                 else
516                                         incomplete_bytes = char.MinValue;
517
518                                 if (ch < 0x80)
519                                 {
520                                         // ASCII
521                                         bytes[byteIndex++] = (byte)ch;
522                                         continue;
523                                 }
524                                 else if (Char.IsSurrogate(ch))
525                                 {
526                                         // Surrogate
527                                         if (charIndex == charEnd)
528                                         {
529                                                 incomplete_bytes = ch;
530                                                 break; // incomplete
531                                         }
532                                         char ch2 = chars[charIndex++];
533                                         if (!Char.IsSurrogate(ch2))
534                                         {
535                                                 // invalid surrogate
536 #if NET_2_0
537                                                 HandleFallback (chars, ref charIndex, ref charCount,
538                                                         bytes, ref byteIndex, ref byteCount, null);
539 #else
540                                                 bytes [byteIndex++] = (byte) '?';
541 #endif
542                                                 continue;
543                                         }
544                                         int cp = (ch - 0xD800) * 0x400 + ch2 - 0xDC00;
545                                         GB18030Source.Unlinear(bytes,  byteIndex, GB18030Source.FromUCSSurrogate(cp));
546                                         byteIndex += 4;
547                                         continue;
548                                 }
549
550
551                                 if (ch <= 0x80 || ch == 0xFF)
552                                 {
553                                         // Character maps to itself
554                                         bytes[byteIndex++] = (byte)ch;
555                                         continue;
556                                 }
557
558                                 byte b1 = gb2312.u2n[((int)ch) * 2 + 1];
559                                 byte b2 = gb2312.u2n[((int)ch) * 2];
560                                 if (b1 != 0 && b2 != 0)
561                                 {
562                                         bytes[byteIndex++] = b1;
563                                         bytes[byteIndex++] = b2;
564                                         continue;
565                                 }
566
567                                 long value = GB18030Source.FromUCS(ch);
568                                 if (value < 0)
569                                         bytes[byteIndex++] = 0x3F; // invalid(?)
570                                 else
571                                 {
572                                         // non-GB2312
573                                         GB18030Source.Unlinear(bytes, byteIndex, value);
574                                         byteIndex += 4;
575                                 }
576                         }
577
578                         if (refresh)
579                         {
580                                 if (incomplete_bytes != char.MinValue)
581                                         bytes[byteIndex++] = 0x3F; // incomplete
582                                 incomplete_bytes = char.MinValue;
583                         }
584
585                         return byteIndex - byteStart;
586                 }
587 #endif
588         }
589 }