2006-01-23 Atsushi Enomoto <atsushi@ximian.com>
[mono.git] / mcs / class / I18N / CJK / GB18030Encoding.cs
1 //
2 // GB18030Encoding.cs
3 //
4 // Author:
5 //      Atsushi Enomoto  <atsushi@ximian.com>
6 //
7 using System;
8 using System.Reflection;
9 using System.Text;
10 using I18N.Common;
11
12 namespace I18N.CJK
13 {
14         [Serializable]
15         internal class ENCgb18030 : GB18030Encoding
16         {
17                 public ENCgb18030 (): base () {}
18         }
19
20         [Serializable]
21         public class CP54936 : GB18030Encoding { }
22
23         [Serializable]
24         public class GB18030Encoding : MonoEncoding
25         {
26                 // Constructor.
27                 public GB18030Encoding ()
28                         : base (54936)
29                 {
30                 }
31
32                 public override string EncodingName {
33                         get { return "Chinese Simplified (GB18030)"; }
34                 }
35
36                 public override string WebName {
37                         get { return "GB18030"; }
38                 }
39
40                 public override int GetMaxByteCount (int len)
41                 {
42                         // non-GB2312 characters in \u0080 - \uFFFF
43                         return len * 4;
44                 }
45
46                 public override int GetMaxCharCount (int len)
47                 {
48                         return len;
49                 }
50
51                 public override int GetByteCount (char [] chars, int index, int length)
52                 {
53                         return new GB18030Encoder (this).GetByteCount (chars, index, length, true);
54                 }
55
56                 public unsafe override int GetBytesImpl (char* chars, int charCount, byte* bytes, int byteCount)
57                 {
58                         return new GB18030Encoder (this).GetBytesImpl (chars, charCount, bytes, byteCount, true);
59                 }
60
61                 public override int GetCharCount (byte [] bytes, int start, int len)
62                 {
63                         return new GB18030Decoder ().GetCharCount (bytes, start, len);
64                 }
65
66                 public override int GetChars (byte [] bytes, int byteIdx, int srclen, char [] chars, int charIdx)
67                 {
68                         return new GB18030Decoder ().GetChars (bytes, byteIdx, srclen, chars, charIdx);
69                 }
70
71                 public override Encoder GetEncoder ()
72                 {
73                         return new GB18030Encoder (this);
74                 }
75
76                 public override Decoder GetDecoder ()
77                 {
78                         return new GB18030Decoder ();
79                 }
80         }
81
82         class GB18030Decoder : DbcsEncoding.DbcsDecoder
83         {
84                 static DbcsConvert gb2312 = DbcsConvert.Gb2312;
85                 // for now incomplete block is not supported - should we?
86                 // int incomplete1 = -1, incomplete2 = -1, incomplete3 = -1;
87
88                 public GB18030Decoder ()
89                         : base (null)
90                 {
91                 }
92
93                 public override int GetCharCount (byte [] bytes, int start, int len)
94                 {
95                         CheckRange (bytes, start, len);
96
97                         int end = start + len;
98                         int ret = 0;
99                         while (start < end) {
100                                 if (bytes [start] < 0x80) {
101                                         ret++;
102                                         start++;
103                                         continue;
104                                 }
105                                 else if (bytes [start] == 0x80) {
106                                         // Euro sign - actually it is obsolete,
107                                         // now it's just reserved but not used
108                                         ret++;
109                                         start++;
110                                         continue;
111                                 }
112                                 else if (bytes [start] == 0xFF) {
113                                         // invalid data - fill '?'
114                                         ret++;
115                                         start++;
116                                         continue;
117                                 }
118                                 else if (start + 1 >= end) {
119 //                                      incomplete1 = bytes [start];
120 //                                      incomplete2 = -1;
121 //                                      incomplete3 = -1;
122                                         ret++;
123                                         break; // incomplete tail.
124                                 }
125
126                                 byte second = bytes [start + 1];
127                                 if (second == 0x7F || second == 0xFF) {
128                                         // invalid data
129                                         ret++;
130                                         start += 2;
131                                         continue;
132                                 }
133                                 else if (0x30 <= second && second <= 0x39) {
134                                         // UCS mapping
135                                         if (start + 3 >= end) {
136                                                 // incomplete tail.
137 //                                              incomplete1 = bytes [start];
138 //                                              incomplete2 = bytes [start + 1];
139 //                                              if (start + 3 == end)
140 //                                                      incomplete3 = bytes [start + 2];
141                                                 ret += start + 3 == end ? 3 : 2;
142                                                 break;
143                                         }
144                                         long value = GB18030Source.FromGBX (bytes, start);
145                                         if (value < 0) {
146                                                 // invalid data.
147                                                 ret++;
148                                                 start -= (int) value;
149                                         } else if (value >= 0x10000) {
150                                                 // UTF16 surrogate
151                                                 ret += 2;
152                                                 start += 4;
153                                         } else {
154                                                 // UTF16 BMP
155                                                 ret++;
156                                                 start+= 4;
157                                         }
158                                 } else {
159                                         // GB2312 mapping
160                                         start += 2;
161                                         ret++;
162                                 }
163                         }
164                         return ret;
165                 }
166
167                 public override int GetChars (byte [] bytes, int byteIndex, int byteCount, char [] chars, int charIndex)
168                 {
169                         CheckRange (bytes, byteIndex, byteCount, chars, charIndex);
170
171                         int byteEnd = byteIndex + byteCount;
172                         int charStart = charIndex;
173
174                         while (byteIndex < byteEnd) {
175                                 if (bytes [byteIndex] < 0x80) {
176                                         chars [charIndex++] = (char) bytes [byteIndex++];
177                                         continue;
178                                 }
179                                 else if (bytes [byteIndex] == 0x80) {
180                                         // Euro sign - actually it is obsolete,
181                                         // now it's just reserved but not used
182                                         chars [charIndex++] = '\u20AC';
183                                         byteIndex++;
184                                         continue;
185                                 }
186                                 else if (bytes [byteIndex] == 0xFF) {
187                                         // invalid data - fill '?'
188                                         chars [charIndex++] = '?';
189                                         byteIndex++;
190                                         continue;
191                                 }
192                                 else if (byteIndex + 1 >= byteEnd) {
193                                         //incomplete1 = bytes [byteIndex++];
194                                         //incomplete2 = -1;
195                                         //incomplete3 = -1;
196                                         break; // incomplete tail.
197                                 }
198
199                                 byte second = bytes [byteIndex + 1];
200                                 if (second == 0x7F || second == 0xFF) {
201                                         // invalid data
202                                         chars [charIndex++] = '?';
203                                         byteIndex += 2;
204                                 }
205                                 else if (0x30 <= second && second <= 0x39) {
206                                         // UCS mapping
207                                         if (byteIndex + 3 >= byteEnd) {
208                                                 // incomplete tail.
209                                                 //incomplete1 = bytes [byteIndex];
210                                                 //incomplete2 = bytes [byteIndex + 1];
211                                                 //if (byteIndex + 3 == byteEnd)
212                                                 //      incomplete3 = bytes [byteIndex + 2];
213                                                 break;
214                                         }
215                                         long value = GB18030Source.FromGBX (bytes, byteIndex);
216                                         if (value < 0) {
217                                                 // invalid data.
218                                                 chars [charIndex++] = '?';
219                                                 byteIndex -= (int) value;
220                                         } else if (value >= 0x10000) {
221                                                 // UTF16 surrogate
222                                                 value -= 0x10000;
223                                                 chars [charIndex++] = (char) (value / 0x400 + 0xD800);
224                                                 chars [charIndex++] = (char) (value % 0x400 + 0xDC00);
225                                                 byteIndex += 4;
226                                         } else {
227                                                 // UTF16 BMP
228                                                 chars [charIndex++] = (char) value;
229                                                 byteIndex += 4;
230                                         }
231                                 } else {
232                                         byte first = bytes [byteIndex];
233                                         int ord = ((first - 0x81) * 191 + second - 0x40) * 2;
234                                         char c1 = ord < 0 || ord >= gb2312.n2u.Length ?
235                                                 '\0' : (char) (gb2312.n2u [ord] + gb2312.n2u [ord + 1] * 256);
236                                         if (c1 == 0)
237                                                 chars [charIndex++] = '?';
238                                         else
239                                                 chars [charIndex++] = c1;
240                                         byteIndex += 2;
241                                 }
242                         }
243
244                         return charIndex - charStart;
245                 }
246         }
247
248         class GB18030Encoder : MonoEncoding.MonoEncoder
249         {
250                 static DbcsConvert gb2312 = DbcsConvert.Gb2312;
251
252                 public GB18030Encoder (MonoEncoding owner)
253                         : base (owner)
254                 {
255                 }
256
257                 char incomplete_byte_count;
258                 char incomplete_bytes;
259
260                 public override int GetByteCount (char [] chars, int start, int len, bool refresh)
261                 {
262                         if (chars == null)
263                                 throw new ArgumentNullException ("chars");
264                         if (start < 0 || start > chars.Length)
265                                 throw new ArgumentOutOfRangeException ("index");
266                         if (len < 0 || start + len > chars.Length)
267                                 throw new ArgumentOutOfRangeException ("count");
268
269                         int end = start + len;
270                         int ret = 0;
271                         while (start < end) {
272                                 char ch = chars [start];
273                                 if (ch < 0x80) {
274                                         // ASCII
275                                         ret++;
276                                         start++;
277                                         continue;
278                                 } else if (Char.IsSurrogate (ch)) {
279                                         // Surrogate
280                                         if (start + 1 == end) {
281                                                 incomplete_byte_count = ch;
282                                                 start++;
283                                         } else {
284                                                 ret += 4;
285                                                 start += 2;
286                                         }
287                                         continue;
288                                 }
289
290                                 if (ch < 0x80 || ch == 0xFF) {
291                                         // ASCII
292                                         ret++;
293                                         start++;
294                                         continue;
295                                 }
296
297                                 byte b1 = gb2312.u2n [((int) ch) * 2 + 1];
298                                 byte b2 = gb2312.u2n [((int) ch) * 2];
299                                 if (b1 != 0 && b2 != 0) {
300                                         // GB2312
301                                         ret += 2;
302                                         start++;
303                                         continue;
304                                 }
305
306                                 // non-GB2312
307                                 long value = GB18030Source.FromUCS (ch);
308                                 if (value < 0)
309                                         ret++; // invalid(?)
310                                 else
311                                         ret += 4;
312                                 start++;
313                         }
314
315                         if (refresh) {
316                                 if (incomplete_byte_count != char.MinValue)
317                                         ret++;
318                                 incomplete_byte_count = char.MinValue;
319                         }
320                         return ret;
321                 }
322
323                 public unsafe override int GetBytesImpl (char* chars, int charCount, byte* bytes, int byteCount, bool refresh)
324                 {
325                         int charIndex = 0;
326                         int byteIndex = 0;
327 #if NET_2_0
328                         EncoderFallbackBuffer buffer = null;
329 #endif
330
331                         int charEnd = charIndex + charCount;
332                         int byteStart = byteIndex;
333                         char ch = incomplete_bytes;
334
335                         while (charIndex < charEnd) {
336                                 if (incomplete_bytes == char.MinValue)
337                                         ch = chars [charIndex++];
338                                 else
339                                         incomplete_bytes = char.MinValue;
340
341                                 if (ch < 0x80) {
342                                         // ASCII
343                                         bytes [byteIndex++] = (byte) ch;
344                                         continue;
345                                 } else if (Char.IsSurrogate (ch)) {
346                                         // Surrogate
347                                         if (charIndex == charEnd) {
348                                                 incomplete_bytes = ch;
349                                                 break; // incomplete
350                                         }
351                                         char ch2 = chars [charIndex++];
352                                         if (!Char.IsSurrogate (ch2)) {
353                                                 // invalid surrogate
354 #if NET_2_0
355                                                 HandleFallback (
356                                                         chars, ref charIndex, ref charCount,
357                                                         bytes, ref byteIndex, ref byteCount);
358 #else
359                                                 bytes [byteIndex++] = (byte) '?';
360 #endif
361                                                 continue;
362                                         }
363                                         int cp = (ch - 0xD800) * 0x400 + ch2 - 0xDC00;
364                                         GB18030Source.Unlinear (bytes + byteIndex, GB18030Source.FromUCSSurrogate (cp));
365                                         byteIndex += 4;
366                                         continue;
367                                 }
368
369
370                                 if (ch <= 0x80 || ch == 0xFF) {
371                                         // Character maps to itself
372                                         bytes [byteIndex++] = (byte) ch;
373                                         continue;
374                                 }
375
376                                 byte b1 = gb2312.u2n [((int) ch) * 2 + 1];
377                                 byte b2 = gb2312.u2n [((int) ch) * 2];
378                                 if (b1 != 0 && b2 != 0) {
379                                         bytes [byteIndex++] = b1;
380                                         bytes [byteIndex++] = b2;
381                                         continue;
382                                 }
383
384                                 long value = GB18030Source.FromUCS (ch);
385                                 if (value < 0)
386                                         bytes [byteIndex++] = 0x3F; // invalid(?)
387                                 else {
388                                         // non-GB2312
389                                         GB18030Source.Unlinear (bytes + byteIndex, value);
390                                         byteIndex += 4;
391                                 }
392                         }
393
394                         if (refresh) {
395                                 if (incomplete_bytes != char.MinValue)
396                                         bytes [byteIndex++] = 0x3F; // incomplete
397                                 incomplete_bytes = char.MinValue;
398                         }
399
400                         return byteIndex - byteStart;
401                 }
402         }
403 }