In corlib/System.Runtime.InteropServices:
[mono.git] / mcs / class / I18N / CJK / GB18030Encoding.cs
1 //
2 // GB18030Encoding.cs
3 //
4 // Author:
5 //      Atsushi Enomoto  <atsushi@ximian.com>
6 //
7 using System;
8 using System.Reflection;
9 using System.Text;
10 using I18N.Common;
11
12 namespace I18N.CJK
13 {
// Internal, serializable alias of GB18030Encoding. It adds no members
// or behaviour of its own.
// NOTE(review): presumably resolved by name ("gb18030") by the I18N
// encoding loader - confirm against the loader code.
[Serializable]
internal class ENCgb18030 : GB18030Encoding
{
        // The parameterless GB18030Encoding constructor is chained
        // implicitly.
        public ENCgb18030 ()
        {
        }
}
19
// Public, serializable alias of GB18030Encoding. The name matches the
// 54936 value that GB18030Encoding hands to its base constructor.
[Serializable]
public class CP54936 : GB18030Encoding
{
        // Spelled-out equivalent of the compiler-generated public
        // default constructor.
        public CP54936 ()
        {
        }
}
22
// GB18030 text encoding. All conversion work is delegated to
// GB18030Encoder / GB18030Decoder; this class only supplies the
// encoding metadata and the one-shot (stateless) entry points.
[Serializable]
public class GB18030Encoding : MonoEncoding
{
        // Constructor.
        // NOTE(review): 54936 / 936 are presumably the code page and the
        // Windows code page expected by MonoEncoding - confirm.
        public GB18030Encoding ()
                : base (54936, 936)
        {
        }

        public override string EncodingName {
                get { return "Chinese Simplified (GB18030)"; }
        }

        public override string HeaderName {
                get { return "GB18030"; }
        }

        public override string BodyName {
                get { return "GB18030"; }
        }

        public override string WebName {
                get { return "GB18030"; }
        }

        public override bool IsMailNewsDisplay {
                get { return true; }
        }

        public override bool IsMailNewsSave {
                get { return true; }
        }

        public override bool IsBrowserDisplay {
                get { return true; }
        }

        public override bool IsBrowserSave {
                get { return true; }
        }

        // Upper bound on the number of bytes produced for len chars.
        // Throws ArgumentOutOfRangeException when len is negative or
        // when len * 4 does not fit in an Int32.
        public override int GetMaxByteCount (int len)
        {
                if (len < 0)
                        throw new ArgumentOutOfRangeException ("len");
                // Without this guard the multiplication below wraps
                // around silently and can yield a negative "maximum".
                if (len > Int32.MaxValue / 4)
                        throw new ArgumentOutOfRangeException ("len");

                // non-GB2312 characters in \u0080 - \uFFFF
                return len * 4;
        }

        // Upper bound on the number of chars produced for len bytes:
        // the decoder never emits more chars than it consumes bytes.
        // Throws ArgumentOutOfRangeException when len is negative.
        public override int GetMaxCharCount (int len)
        {
                if (len < 0)
                        throw new ArgumentOutOfRangeException ("len");
                return len;
        }

        // One-shot count: uses a fresh encoder and flushes it, so no
        // state is shared between calls.
        public override int GetByteCount (char [] chars, int index, int length)
        {
                return new GB18030Encoder (this).GetByteCount (chars, index, length, true);
        }

        public unsafe override int GetByteCountImpl (char* chars, int count)
        {
                return new GB18030Encoder (this).GetByteCountImpl (chars, count, true);
        }

        // One-shot encode: uses a fresh encoder and flushes it.
        public unsafe override int GetBytesImpl (char* chars, int charCount, byte* bytes, int byteCount)
        {
                return new GB18030Encoder (this).GetBytesImpl (chars, charCount, bytes, byteCount, true);
        }

        public override int GetCharCount (byte [] bytes, int start, int len)
        {
                return new GB18030Decoder ().GetCharCount (bytes, start, len);
        }

        public override int GetChars (byte [] bytes, int byteIdx, int srclen, char [] chars, int charIdx)
        {
                return new GB18030Decoder ().GetChars (bytes, byteIdx, srclen, chars, charIdx);
        }

        public override Encoder GetEncoder ()
        {
                return new GB18030Encoder (this);
        }

        public override Decoder GetDecoder ()
        {
                return new GB18030Decoder ();
        }
}
110
// Decoder for GB18030 byte sequences.
//
// Byte layout as handled below:
//   - first byte <  0x80 : single byte, copied as-is
//   - first byte == 0x80 : single byte, decoded as U+20AC
//   - first byte == 0xFF : invalid, decoded as '?'
//   - second byte 0x30..0x39 : four-byte form, resolved through
//     GB18030Source.FromGBX
//   - anything else : two-byte form, resolved through the GB2312 table
//
// For now an incomplete block is not supported: no state is carried
// from one call to the next. Every byte of a truncated trailing
// sequence is reported as one '?', and GetCharCount / GetChars agree
// on that, so a buffer sized by GetCharCount is always filled exactly.
class GB18030Decoder : DbcsEncoding.DbcsDecoder
{
        static DbcsConvert gb2312 = DbcsConvert.Gb2312;

        public GB18030Decoder ()
                : base (null)
        {
        }

        // Returns the number of chars GetChars produces for
        // bytes [start .. start + len).
        public override int GetCharCount (byte [] bytes, int start, int len)
        {
                CheckRange (bytes, start, len);

                int end = start + len;
                int ret = 0;
                while (start < end) {
                        byte first = bytes [start];
                        if (first <= 0x80 || first == 0xFF) {
                                // ASCII, the reserved 0x80 (euro sign) or
                                // the invalid 0xFF: one char per byte.
                                ret++;
                                start++;
                                continue;
                        }
                        if (start + 1 >= end) {
                                // incomplete tail: lone lead byte.
                                ret++;
                                break;
                        }

                        byte second = bytes [start + 1];
                        if (second == 0x7F || second == 0xFF) {
                                // invalid data
                                ret++;
                                start += 2;
                        }
                        else if (0x30 <= second && second <= 0x39) {
                                // UCS mapping
                                if (start + 3 >= end) {
                                        // incomplete tail: 2 or 3 bytes of
                                        // a four-byte sequence are left,
                                        // one char is counted per byte.
                                        ret += end - start;
                                        break;
                                }
                                long value = GB18030Source.FromGBX (bytes, start);
                                if (value < 0) {
                                        // invalid data; the negative result
                                        // is used as the number of bytes
                                        // to skip.
                                        ret++;
                                        start -= (int) value;
                                } else {
                                        // UTF16 surrogate pair or UTF16 BMP
                                        ret += value >= 0x10000 ? 2 : 1;
                                        start += 4;
                                }
                        } else {
                                // GB2312 mapping
                                ret++;
                                start += 2;
                        }
                }
                return ret;
        }

        // Decodes bytes [byteIndex .. byteIndex + byteCount) into chars
        // starting at charIndex and returns the number of chars written.
        public override int GetChars (byte [] bytes, int byteIndex, int byteCount, char [] chars, int charIndex)
        {
                CheckRange (bytes, byteIndex, byteCount, chars, charIndex);

                int byteEnd = byteIndex + byteCount;
                int charStart = charIndex;

                while (byteIndex < byteEnd) {
                        byte first = bytes [byteIndex];
                        if (first < 0x80) {
                                chars [charIndex++] = (char) first;
                                byteIndex++;
                                continue;
                        }
                        if (first == 0x80) {
                                // Euro sign - actually it is obsolete,
                                // now it's just reserved but not used
                                chars [charIndex++] = '\u20AC';
                                byteIndex++;
                                continue;
                        }
                        if (first == 0xFF) {
                                // invalid data - fill '?'
                                chars [charIndex++] = '?';
                                byteIndex++;
                                continue;
                        }
                        if (byteIndex + 1 >= byteEnd) {
                                // incomplete tail: lone lead byte. Emit
                                // the '?' that GetCharCount counted for it
                                // instead of dropping it silently.
                                chars [charIndex++] = '?';
                                break;
                        }

                        byte second = bytes [byteIndex + 1];
                        if (second == 0x7F || second == 0xFF) {
                                // invalid data
                                chars [charIndex++] = '?';
                                byteIndex += 2;
                        }
                        else if (0x30 <= second && second <= 0x39) {
                                // UCS mapping
                                if (byteIndex + 3 >= byteEnd) {
                                        // incomplete tail: one '?' per
                                        // leftover byte, matching what
                                        // GetCharCount reports.
                                        for (; byteIndex < byteEnd; byteIndex++)
                                                chars [charIndex++] = '?';
                                        break;
                                }
                                long value = GB18030Source.FromGBX (bytes, byteIndex);
                                if (value < 0) {
                                        // invalid data; the negative result
                                        // is used as the number of bytes
                                        // to skip.
                                        chars [charIndex++] = '?';
                                        byteIndex -= (int) value;
                                } else if (value >= 0x10000) {
                                        // UTF16 surrogate
                                        value -= 0x10000;
                                        chars [charIndex++] = (char) (value / 0x400 + 0xD800);
                                        chars [charIndex++] = (char) (value % 0x400 + 0xDC00);
                                        byteIndex += 4;
                                } else {
                                        // UTF16 BMP
                                        chars [charIndex++] = (char) value;
                                        byteIndex += 4;
                                }
                        } else {
                                // GB2312 mapping: two table bytes per
                                // entry, low byte first.
                                int ord = ((first - 0x81) * 191 + second - 0x40) * 2;
                                // Both ord and ord + 1 are read, so the
                                // upper bound must cover ord + 1.
                                char c1 = ord < 0 || ord + 1 >= gb2312.n2u.Length ?
                                        '\0' : (char) (gb2312.n2u [ord] + gb2312.n2u [ord + 1] * 256);
                                // A zero entry means "no mapping".
                                chars [charIndex++] = c1 == 0 ? '?' : c1;
                                byteIndex += 2;
                        }
                }

                return charIndex - charStart;
        }
}
276
// Encoder for GB18030.
//
// Output forms as produced below:
//   - chars <= 0x80 and 0xFF : one byte, the char value itself
//   - chars found in the GB2312 table : two bytes
//   - other BMP chars : resolved through GB18030Source.FromUCS and
//     written by GB18030Source.Unlinear (the index advances by 4),
//     or one '?' byte when FromUCS reports a negative value
//   - surrogate pairs : GB18030Source.FromUCSSurrogate + Unlinear
//     (the index advances by 4)
//
// The only state kept between calls is a surrogate that ended a
// GetBytesImpl call without its partner.
class GB18030Encoder : MonoEncoder
{
        static DbcsConvert gb2312 = DbcsConvert.Gb2312;

        public GB18030Encoder (MonoEncoding owner)
                : base (owner)
        {
        }

        // Surrogate left over from the previous GetBytesImpl call, or
        // char.MinValue when nothing is pending.
        char incomplete_bytes;

        // Returns the number of bytes GetBytesImpl would write for the
        // same arguments, starting from the current pending-surrogate
        // state. The encoder state itself is never modified here.
        public unsafe override int GetByteCountImpl (char* chars, int count, bool refresh)
        {
                int start = 0;
                int end = count;
                int ret = 0;
                // Local copy of the pending surrogate: counting must not
                // disturb what a following GetBytesImpl call will see.
                char pending = incomplete_bytes;

                while (start < end) {
                        char ch;
                        if (pending != char.MinValue) {
                                // Same order as GetBytesImpl: the pending
                                // surrogate comes before the new input.
                                ch = pending;
                                pending = char.MinValue;
                        } else
                                ch = chars [start++];

                        if (ch < 0x80) {
                                // ASCII
                                ret++;
                                continue;
                        }
                        if (Char.IsSurrogate (ch)) {
                                // Surrogate
                                if (start == end) {
                                        // partner not available yet
                                        pending = ch;
                                        break;
                                }
                                // GetBytesImpl consumes the following char
                                // as well; 4 bytes are reserved for the
                                // pair.
                                start++;
                                ret += 4;
                                continue;
                        }

                        if (ch == 0x80 || ch == 0xFF) {
                                // Written as a single byte by GetBytesImpl
                                ret++;
                                continue;
                        }

                        byte b1 = gb2312.u2n [((int) ch) * 2 + 1];
                        byte b2 = gb2312.u2n [((int) ch) * 2];
                        if (b1 != 0 && b2 != 0) {
                                // GB2312
                                ret += 2;
                                continue;
                        }

                        // non-GB2312: 4 bytes, or a single '?' when there
                        // is no mapping
                        ret += GB18030Source.FromUCS (ch) < 0 ? 1 : 4;
                }

                // On flush an unpaired trailing surrogate becomes one '?'.
                if (refresh && pending != char.MinValue)
                        ret++;
                return ret;
        }

        // Encodes charCount chars into bytes and returns the number of
        // bytes written. byteCount is the capacity of the output buffer;
        // ArgumentException is thrown instead of writing past it.
        // When refresh is true an unpaired trailing surrogate is written
        // as '?', otherwise it is kept for the next call.
        public unsafe override int GetBytesImpl (char* chars, int charCount, byte* bytes, int byteCount, bool refresh)
        {
                int charIndex = 0;
                int byteIndex = 0;

                int charEnd = charIndex + charCount;
                int byteStart = byteIndex;
                // Captured up front because byteCount is handed to
                // HandleFallback by reference further down.
                int byteLimit = byteCount;
                char ch = incomplete_bytes;

                while (charIndex < charEnd) {
                        if (incomplete_bytes == char.MinValue)
                                ch = chars [charIndex++];
                        else
                                // ch already holds the pending surrogate
                                incomplete_bytes = char.MinValue;

                        if (ch < 0x80) {
                                // ASCII
                                CheckOutputSpace (byteIndex, 1, byteLimit);
                                bytes [byteIndex++] = (byte) ch;
                                continue;
                        } else if (Char.IsSurrogate (ch)) {
                                // Surrogate
                                if (charIndex == charEnd) {
                                        incomplete_bytes = ch;
                                        break; // incomplete
                                }
                                char ch2 = chars [charIndex++];
                                // NOTE(review): only "is a surrogate" is
                                // checked, not high-then-low order - confirm
                                // FromUCSSurrogate copes with reversed pairs.
                                if (!Char.IsSurrogate (ch2)) {
                                        // invalid surrogate
#if NET_2_0
                                        // NOTE(review): charIndex already
                                        // points past ch2 here - confirm this
                                        // is what HandleFallback expects.
                                        HandleFallback (
                                                chars, ref charIndex, ref charCount,
                                                bytes, ref byteIndex, ref byteCount);
#else
                                        CheckOutputSpace (byteIndex, 1, byteLimit);
                                        bytes [byteIndex++] = (byte) '?';
#endif
                                        continue;
                                }
                                int cp = (ch - 0xD800) * 0x400 + ch2 - 0xDC00;
                                CheckOutputSpace (byteIndex, 4, byteLimit);
                                GB18030Source.Unlinear (bytes + byteIndex, GB18030Source.FromUCSSurrogate (cp));
                                byteIndex += 4;
                                continue;
                        }

                        if (ch <= 0x80 || ch == 0xFF) {
                                // Character maps to itself
                                CheckOutputSpace (byteIndex, 1, byteLimit);
                                bytes [byteIndex++] = (byte) ch;
                                continue;
                        }

                        byte b1 = gb2312.u2n [((int) ch) * 2 + 1];
                        byte b2 = gb2312.u2n [((int) ch) * 2];
                        if (b1 != 0 && b2 != 0) {
                                // GB2312
                                CheckOutputSpace (byteIndex, 2, byteLimit);
                                bytes [byteIndex++] = b1;
                                bytes [byteIndex++] = b2;
                                continue;
                        }

                        long value = GB18030Source.FromUCS (ch);
                        if (value < 0) {
                                CheckOutputSpace (byteIndex, 1, byteLimit);
                                bytes [byteIndex++] = 0x3F; // invalid(?)
                        } else {
                                // non-GB2312
                                CheckOutputSpace (byteIndex, 4, byteLimit);
                                GB18030Source.Unlinear (bytes + byteIndex, value);
                                byteIndex += 4;
                        }
                }

                if (refresh) {
                        if (incomplete_bytes != char.MinValue) {
                                CheckOutputSpace (byteIndex, 1, byteLimit);
                                bytes [byteIndex++] = 0x3F; // incomplete
                        }
                        incomplete_bytes = char.MinValue;
                }

                return byteIndex - byteStart;
        }

        // Throws ArgumentException when fewer than needed bytes remain
        // between byteIndex and byteLimit.
        static void CheckOutputSpace (int byteIndex, int needed, int byteLimit)
        {
                if (byteLimit - byteIndex < needed)
                        throw new ArgumentException ("Insufficient space available to store the encoded bytes.", "bytes");
        }
}
423 }