2005-11-24 Chris Toshok <toshok@ximian.com>
[mono.git] / mcs / class / I18N / CJK / GB18030Encoding.cs
1 //
2 // GB18030Encoding.cs
3 //
4 // Author:
5 //      Atsushi Enomoto  <atsushi@ximian.com>
6 //
7 using System;
8 using System.Reflection;
9 using System.Text;
10
11 namespace I18N.CJK
12 {
13         internal class ENCgb18030 : GB18030Encoding
14         {
15                 public ENCgb18030 (): base () {}
16         }
17
18         public class CP54936 : GB18030Encoding { }
19
20         public class GB18030Encoding : Encoding
21         {
22                 // Constructor.
23                 public GB18030Encoding ()
24                         : base (54936)
25                 {
26                 }
27
28                 public override string EncodingName {
29                         get { return "Chinese Simplified (GB18030)"; }
30                 }
31
32                 public override string WebName {
33                         get { return "GB18030"; }
34                 }
35
36                 public override int GetMaxByteCount (int len)
37                 {
38                         // non-GB2312 characters in \u0080 - \uFFFF
39                         return len * 4;
40                 }
41
42                 public override int GetMaxCharCount (int len)
43                 {
44                         return len;
45                 }
46
47                 public override int GetByteCount (char [] chars, int index, int length)
48                 {
49                         return new GB18030Encoder ().GetByteCount (chars, index, length, true);
50                 }
51
52                 public override int GetBytes (char [] chars, int charIdx, int srclen, byte [] bytes, int byteIdx)
53                 {
54                         return new GB18030Encoder ().GetBytes (chars, charIdx, srclen, bytes, byteIdx, true);
55                 }
56
57                 public override int GetCharCount (byte [] bytes, int start, int len)
58                 {
59                         return new GB18030Decoder ().GetCharCount (bytes, start, len);
60                 }
61
62                 public override int GetChars (byte [] bytes, int byteIdx, int srclen, char [] chars, int charIdx)
63                 {
64                         return new GB18030Decoder ().GetChars (bytes, byteIdx, srclen, chars, charIdx);
65                 }
66         }
67
68         class GB18030Decoder : Decoder
69         {
70                 Gb2312Convert gb2312 = Gb2312Convert.Convert;
71                 // for now incomplete block is not supported - should we?
72                 // int incomplete1 = -1, incomplete2 = -1, incomplete3 = -1;
73
74                 public override int GetCharCount (byte [] bytes, int start, int len)
75                 {
76                         if (bytes == null)
77                                 throw new ArgumentNullException ("bytes");
78                         if (start < 0 || start > bytes.Length)
79                                 throw new ArgumentOutOfRangeException ("start");
80                         if (len < 0 || start + len > bytes.Length)
81                                 throw new ArgumentOutOfRangeException ("len");
82
83                         int end = start + len;
84                         int ret = 0;
85                         while (start < end) {
86                                 if (bytes [start] < 0x80) {
87                                         ret++;
88                                         start++;
89                                         continue;
90                                 }
91                                 else if (bytes [start] == 0x80) {
92                                         // Euro sign - actually it is obsolete,
93                                         // now it's just reserved but not used
94                                         ret++;
95                                         start++;
96                                         continue;
97                                 }
98                                 else if (bytes [start] == 0xFF) {
99                                         // invalid data - fill '?'
100                                         ret++;
101                                         start++;
102                                         continue;
103                                 }
104                                 else if (start + 1 >= end) {
105 //                                      incomplete1 = bytes [start];
106 //                                      incomplete2 = -1;
107 //                                      incomplete3 = -1;
108                                         ret++;
109                                         break; // incomplete tail.
110                                 }
111
112                                 byte second = bytes [start + 1];
113                                 if (second == 0x7F || second == 0xFF) {
114                                         // invalid data
115                                         ret++;
116                                         start += 2;
117                                         continue;
118                                 }
119                                 else if (0x30 <= second && second <= 0x39) {
120                                         // UCS mapping
121                                         if (start + 3 >= end) {
122                                                 // incomplete tail.
123 //                                              incomplete1 = bytes [start];
124 //                                              incomplete2 = bytes [start + 1];
125 //                                              if (start + 3 == end)
126 //                                                      incomplete3 = bytes [start + 2];
127                                                 ret += start + 3 == end ? 3 : 2;
128                                                 break;
129                                         }
130                                         long value = GB18030Source.FromGBX (bytes, start);
131                                         if (value < 0) {
132                                                 // invalid data.
133                                                 ret++;
134                                                 start -= (int) value;
135                                         } else if (value >= 0x10000) {
136                                                 // UTF16 surrogate
137                                                 ret += 2;
138                                                 start += 4;
139                                         } else {
140                                                 // UTF16 BMP
141                                                 ret++;
142                                                 start+= 4;
143                                         }
144                                 } else {
145                                         // GB2312 mapping
146                                         start += 2;
147                                         ret++;
148                                 }
149                         }
150                         return ret;
151                 }
152
153                 public override int GetChars (byte [] bytes, int byteIndex, int byteCount, char [] chars, int charIndex)
154                 {
155                         if (bytes == null)
156                                 throw new ArgumentNullException ("bytes");
157                         if (chars == null)
158                                 throw new ArgumentNullException ("chars");
159                         if (byteIndex < 0 || byteIndex > bytes.Length)
160                                 throw new ArgumentOutOfRangeException ("byteIndex");
161                         if (byteCount < 0 || byteIndex + byteCount > bytes.Length)
162                                 throw new ArgumentOutOfRangeException ("byteCount");
163                         if (charIndex < 0 || charIndex > chars.Length)
164                                 throw new ArgumentOutOfRangeException ("charIndex");
165
166                         int byteEnd = byteIndex + byteCount;
167                         int charStart = charIndex;
168
169                         while (byteIndex < byteEnd) {
170                                 if (bytes [byteIndex] < 0x80) {
171                                         chars [charIndex++] = (char) bytes [byteIndex++];
172                                         continue;
173                                 }
174                                 else if (bytes [byteIndex] == 0x80) {
175                                         // Euro sign - actually it is obsolete,
176                                         // now it's just reserved but not used
177                                         chars [charIndex++] = '\u20AC';
178                                         byteIndex++;
179                                         continue;
180                                 }
181                                 else if (bytes [byteIndex] == 0xFF) {
182                                         // invalid data - fill '?'
183                                         chars [charIndex++] = '?';
184                                         byteIndex++;
185                                         continue;
186                                 }
187                                 else if (byteIndex + 1 >= byteEnd) {
188                                         //incomplete1 = bytes [byteIndex++];
189                                         //incomplete2 = -1;
190                                         //incomplete3 = -1;
191                                         break; // incomplete tail.
192                                 }
193
194                                 byte second = bytes [byteIndex + 1];
195                                 if (second == 0x7F || second == 0xFF) {
196                                         // invalid data
197                                         chars [charIndex++] = '?';
198                                         byteIndex += 2;
199                                 }
200                                 else if (0x30 <= second && second <= 0x39) {
201                                         // UCS mapping
202                                         if (byteIndex + 3 >= byteEnd) {
203                                                 // incomplete tail.
204                                                 //incomplete1 = bytes [byteIndex];
205                                                 //incomplete2 = bytes [byteIndex + 1];
206                                                 //if (byteIndex + 3 == byteEnd)
207                                                 //      incomplete3 = bytes [byteIndex + 2];
208                                                 break;
209                                         }
210                                         long value = GB18030Source.FromGBX (bytes, byteIndex);
211                                         if (value < 0) {
212                                                 // invalid data.
213                                                 chars [charIndex++] = '?';
214                                                 byteIndex -= (int) value;
215                                         } else if (value >= 0x10000) {
216                                                 // UTF16 surrogate
217                                                 value -= 0x10000;
218                                                 chars [charIndex++] = (char) (value / 0x400 + 0xD800);
219                                                 chars [charIndex++] = (char) (value % 0x400 + 0xDC00);
220                                                 byteIndex += 4;
221                                         } else {
222                                                 // UTF16 BMP
223                                                 chars [charIndex++] = (char) value;
224                                                 byteIndex += 4;
225                                         }
226                                 } else {
227                                         // GB2312 mapping, or invalid.
228                                         // ('second' is always valid here).
229                                         int head = bytes [byteIndex];
230                                         char c = gb2312.BytePairToChar (ref head, second);
231                                         byteIndex += 2;
232                                         chars [charIndex++] = c == char.MinValue ? '?' : c;
233                                 }
234                         }
235
236                         return charIndex - charStart;
237                 }
238         }
239
240         class GB18030Encoder : Encoder
241         {
242                 Gb2312Convert gb2312 = Gb2312Convert.Convert;
243                 char incomplete;
244
245                 public override int GetByteCount (char [] chars, int start, int len, bool refresh)
246                 {
247                         if (refresh)
248                                 incomplete = char.MinValue;
249
250                         if (chars == null)
251                                 throw new ArgumentNullException ("chars");
252                         if (start < 0 || start > chars.Length)
253                                 throw new ArgumentOutOfRangeException ("index");
254                         if (len < 0 || start + len > chars.Length)
255                                 throw new ArgumentOutOfRangeException ("count");
256
257                         int end = start + len;
258                         int ret = 0;
259                         while (start < end) {
260                                 char ch = chars [start];
261                                 if (ch < 0x80) {
262                                         // ASCII
263                                         ret++;
264                                         start++;
265                                         continue;
266                                 } else if (Char.IsSurrogate (ch)) {
267                                         // Surrogate
268                                         if (start + 1 == end)
269                                                 break; // incomplete
270                                         ret += 4;
271                                         start += 2;
272                                         continue;
273                                 }
274
275                                 if (ch < 0x80 || ch == 0xFF) {
276                                         // ASCII
277                                         ret++;
278                                         start++;
279                                         continue;
280                                 }
281                                 long value = gb2312.UcsToGbk (ch);
282                                 if (value != 0) {
283                                         // GB2312
284                                         ret += 2;
285                                         start++;
286                                         continue;
287                                 }
288
289                                 // non-GB2312
290                                 ret += 4;
291                                 start++;
292                         }
293                         return ret;
294                 }
295
296                 public override int GetBytes (char [] chars, int charIndex, int charCount, byte [] bytes, int byteIndex, bool refresh)
297                 {
298                         if (chars == null)
299                                 throw new ArgumentNullException ("chars");
300                         if (bytes == null)
301                                 throw new ArgumentNullException ("bytes");
302                         if (charIndex < 0 || charIndex > chars.Length)
303                                 throw new ArgumentOutOfRangeException ("charIndex");
304                         if (charCount < 0 || charIndex + charCount > chars.Length)
305                                 throw new ArgumentOutOfRangeException ("charCount");
306                         if (byteIndex < 0 || byteIndex > bytes.Length)
307                                 throw new ArgumentOutOfRangeException ("byteIndex");
308
309                         int charEnd = charIndex + charCount;
310                         int byteStart = byteIndex;
311                         char ch = incomplete;
312
313                         while (charIndex < charEnd) {
314                                 if (incomplete == char.MinValue)
315                                         ch = chars [charIndex++];
316                                 else
317                                         incomplete = char.MinValue;
318
319                                 if (ch < 0x80) {
320                                         // ASCII
321                                         bytes [byteIndex++] = (byte) ch;
322                                         continue;
323                                 } else if (Char.IsSurrogate (ch)) {
324                                         // Surrogate
325                                         if (charIndex == charEnd) {
326                                                 incomplete = ch;
327                                                 break; // incomplete
328                                         }
329                                         char ch2 = chars [charIndex++];
330                                         if (!Char.IsSurrogate (ch2)) {
331                                                 // invalid surrogate
332                                                 bytes [byteIndex++] = (byte) '?';
333                                                 continue;
334                                         }
335                                         int cp = (ch - 0xD800) * 0x400 + ch2 - 0xDC00;
336                                         GB18030Source.Unlinear (bytes, byteIndex, GB18030Source.FromUCSSurrogate (cp));
337                                         byteIndex += 4;
338                                         continue;
339                                 }
340
341
342                                 if (ch <= 0x80 || ch == 0xFF) {
343                                         // Character maps to itself
344                                         bytes [byteIndex++] = (byte) ch;
345                                         continue;
346                                 }
347
348                                 long value = gb2312.UcsToGbk (ch);
349                                 if (value != 0) {
350                                         bytes [byteIndex++] = (byte) (value / 0x100);
351                                         bytes [byteIndex++] = (byte) (value % 0x100);
352                                         continue;
353                                 }
354
355                                 value = GB18030Source.FromUCS (ch);
356                                 // non-GB2312
357                                 GB18030Source.Unlinear (bytes, byteIndex, value);
358                                 byteIndex += 4;
359                         }
360                         return byteIndex - byteStart;
361                 }
362         }
363 }