Merge pull request #2475 from sandyarmstrong/getcultureinfo-37848
[mono.git] / mcs / class / I18N / CJK / GB18030Source.cs
1 //
// GB18030Source.cs
3 //
4 // Author:
5 //      Atsushi Enomoto  <atsushi@ximian.com>
6 //
7 using System;
8 using System.Globalization;
9 using System.Reflection;
10 using System.Text;
11 using System.Runtime.InteropServices;
12 #if BUILD_GENERATOR
13 using System.IO;
14 using System.Xml;
15 #endif
16
17 namespace I18N.CJK
18 {
19         internal unsafe class GB18030Source
20         {
21                 class GB18030Map
22                 {
23                         public readonly int UStart;
24                         public readonly int UEnd;
25                         public readonly long GStart;
26                         public readonly long GEnd;
27                         public readonly bool Dummy; // This range is actually not usable.
28
29                         public GB18030Map (
30                                 int ustart, int uend, long gstart, long gend, bool dummy)
31                         {
32                                 this.UStart = ustart;
33                                 this.UEnd = uend;
34                                 this.GStart = gstart;
35                                 this.GEnd = gend;
36                                 this.Dummy = dummy;
37                         }
38                 }
39
40                 private GB18030Source ()
41                 {
42                 }
43
                // Raw lookup tables, assigned once by the static constructor from
                // the "gb18030.table" resource. Both hold two-byte big-endian
                // entries (see ToUcsRaw / ToGbxRaw) and stay null when the
                // resource could not be obtained.
                static readonly byte *gbx2uni;
                static readonly byte *uni2gbx;
                // Sizes of the two tables in bytes, as read from the resource
                // headers; both remain 0 when no table was loaded.
                static readonly int gbx2uniSize, uni2gbxSize;
47
48                 static GB18030Source ()
49                 {
50                         MethodInfo mi = typeof (Assembly).GetMethod (
51                                 "GetManifestResourceInternal",
52                                 BindingFlags.NonPublic | BindingFlags.Instance);
53
54                         int size = 0;
55                         Module mod = null;
56                         IntPtr ret = IntPtr.Zero;
57
58                         if (mi != null)
59                         {
60                                 ret = (IntPtr)mi.Invoke(
61                                  Assembly.GetExecutingAssembly(),
62                                  new object[] { "gb18030.table", size, mod });
63                         }
64                         else
65                         {
66                                 // DotNet's way ;)
67                                 using (var ms = Assembly.GetExecutingAssembly()
68                                         .GetManifestResourceStream("gb18030.table"))
69                                 {
70                                         var len = (int)ms.Length;
71                                         byte* buf = (byte*)Marshal.AllocHGlobal(sizeof(byte) * len);
72
73                                         for (int i = 0; i < len; i++)
74                                                 buf[i] = (byte)ms.ReadByte();
75                                         
76                                         ret = (IntPtr)buf;
77                                 }
78                         }
79
80                         if (ret != IntPtr.Zero) {
81                                 gbx2uni = (byte*) ((void*) ret);
82                                 gbx2uniSize =
83                                         (gbx2uni [0] << 24) + (gbx2uni [1] << 16) +
84                                         (gbx2uni [2] << 8) + (gbx2uni [3]);
85                                 gbx2uni += 4;
86                                 uni2gbx = gbx2uni + gbx2uniSize;
87                                 uni2gbxSize =
88                                         (uni2gbx [0] << 24) + (uni2gbx [1] << 16) +
89                                         (uni2gbx [2] << 8) + (uni2gbx [3]);
90                                 uni2gbx += 4;
91                         }
92                 }
93
                // Linear value of the four-byte sequence 81 30 81 30, computed
                // without the supplementary adjustment.
                static readonly long gbxBase =
                        FromGBXRaw (0x81, 0x30, 0x81, 0x30, false);
                // Linear value of the four-byte sequence 90 30 81 30, computed
                // without the supplementary adjustment; FromUCSSurrogate adds
                // its argument to this value.
                static readonly long gbxSuppBase =
                        FromGBXRaw (0x90, 0x30, 0x81, 0x30, false);
98
                // See http://icu.sourceforge.net/docs/papers/gb18030.html
                // and referenced XML mapping table.
                //
                // Each entry pairs an inclusive UCS range with an inclusive range
                // of linear four-byte values. The "rawmap" comments mark the UCS
                // gaps between entries; code points in a gap are resolved through
                // the lookup tables (see FromUCS / FromGBX) instead of by
                // arithmetic. Entries with GStart == 0 are dummies.
                static readonly GB18030Map [] ranges = new GB18030Map [] {
                        // rawmap: 0x0080-0x0451
                        new GB18030Map (0x0452, 0x200F, FromGBXRaw (0x81, 0x30, 0xD3, 0x30, false), FromGBXRaw (0x81, 0x36, 0xA5, 0x31, false), false),
                        // rawmap: 0x2010-0x2642
                        new GB18030Map (0x2643, 0x2E80, FromGBXRaw (0x81, 0x37, 0xA8, 0x39, false), FromGBXRaw (0x81, 0x38, 0xFD, 0x38, false), false),
                        // rawmap: 0x2E81-0x361A
                        new GB18030Map (0x361B, 0x3917, FromGBXRaw (0x82, 0x30, 0xA6, 0x33, false), FromGBXRaw (0x82, 0x30, 0xF2, 0x37, false), false),
                        // rawmap: 0x3918-0x3CE0
                        new GB18030Map (0x3CE1, 0x4055, FromGBXRaw (0x82, 0x31, 0xD4, 0x38, false), FromGBXRaw (0x82, 0x32, 0xAF, 0x32, false), false),
                        // rawmap: 0x4056-0x415F
                        new GB18030Map (0x4160, 0x4336, FromGBXRaw (0x82, 0x32, 0xC9, 0x37, false), FromGBXRaw (0x82, 0x32, 0xF8, 0x37, false), false),
                        // rawmap: 0x4337-0x44D6
                        new GB18030Map (0x44D7, 0x464B, FromGBXRaw (0x82, 0x33, 0xA3, 0x39, false), FromGBXRaw (0x82, 0x33, 0xC9, 0x31, false), false),
                        // rawmap: 0x464C-0x478D
                        new GB18030Map (0x478E, 0x4946, FromGBXRaw (0x82, 0x33, 0xE8, 0x38, false), FromGBXRaw (0x82, 0x34, 0x96, 0x38, false), false),
                        // rawmap: 0x4947-0x49B7
                        new GB18030Map (0x49B8, 0x4C76, FromGBXRaw (0x82, 0x34, 0xA1, 0x31, false), FromGBXRaw (0x82, 0x34, 0xE7, 0x33, false), false),
                        // rawmap: 0x4C77-0x4DFF

                        // 4E00-9FA5 are all mapped in GB2312
                        new GB18030Map (0x4E00, 0x9FA5, 0, 0, true),

                        new GB18030Map (0x9FA6, 0xD7FF, FromGBXRaw (0x82, 0x35, 0x8F, 0x33, false), FromGBXRaw (0x83, 0x36, 0xC7, 0x38, false), false),

                        // D800-DFFF are ignored (surrogate)
                        // E000-E76B are all mapped in GB2312.
                        new GB18030Map (0xD800, 0xE76B, 0, 0, true),

                        // rawmap: 0xE76C-0xE864
                        new GB18030Map (0xE865, 0xF92B, FromGBXRaw (0x83, 0x36, 0xD0, 0x30, false), FromGBXRaw (0x84, 0x30, 0x85, 0x34, false), false),
                        // rawmap: 0xF92C-0xFA29
                        new GB18030Map (0xFA2A, 0xFE2F, FromGBXRaw (0x84, 0x30, 0x9C, 0x38, false), FromGBXRaw (0x84, 0x31, 0x85, 0x37, false), false),
                        // rawmap: 0xFE30-0xFFE5
                        new GB18030Map (0xFFE6, 0xFFFF, FromGBXRaw (0x84, 0x31, 0xA2, 0x34, false), FromGBXRaw (0x84, 0x31, 0xA4, 0x39, false), false),
                        };
136
137                 public static void Unlinear (byte [] bytes, int start, long gbx)
138                 {
139                         fixed (byte* bptr = bytes) {
140                                 Unlinear (bptr + start, gbx);
141                         }
142                 }
143
144                 public static unsafe void Unlinear (byte* bytes, long gbx)
145                 {
146                         bytes [3] = (byte) (gbx % 10 + 0x30);
147                         gbx /= 10;
148                         bytes [2] = (byte) (gbx % 126 + 0x81);
149                         gbx /= 126;
150                         bytes [1] = (byte) (gbx % 10 + 0x30);
151                         gbx /= 10;
152                         bytes [0] = (byte) (gbx + 0x81);
153                 }
154
155                 // negative (invalid) or positive (valid)
156                 public static long FromGBX (byte [] bytes, int start)
157                 {
158                         byte b1 = bytes [start];
159                         byte b2 = bytes [start + 1];
160                         byte b3 = bytes [start + 2];
161                         byte b4 = bytes [start + 3];
162                         if (b1 < 0x81 || b1 == 0xFF)
163                                 return -1;
164                         if (b2 < 0x30 || b2 > 0x39)
165                                 return -2;
166                         if (b3 < 0x81 || b3 == 0xFF)
167                                 return -3;
168                         if (b4 < 0x30 || b4 > 0x39)
169                                 return -4;
170                         if (b1 >= 0x90)
171                                 return FromGBXRaw (b1, b2, b3, b4, true);
172                         long linear = FromGBXRaw (b1, b2, b3, b4, false);
173
174                         long rawOffset = 0;
175                         long startIgnore = 0;
176
177                         for (int i = 0; i < ranges.Length; i++) {
178                                 GB18030Map m = ranges [i];
179                                 if (linear < m.GStart)
180                                         return ToUcsRaw ((int) (linear
181                                                 - startIgnore + rawOffset));
182                                 if (linear <= m.GEnd)
183                                         return linear - gbxBase - m.GStart
184                                                 + m.UStart;
185                                 if (m.GStart != 0) {
186                                         rawOffset += m.GStart - startIgnore;
187                                         startIgnore = m.GEnd + 1;
188                                 }
189                         }
190 //                      return ToUcsRaw ((int) (linear - gbxBase));
191                         throw new SystemException (String.Format ("GB18030 INTERNAL ERROR (should not happen): GBX {0:x02} {1:x02} {2:x02} {3:x02}", b1, b2, b3, b4));
192                 }
193
194                 public static long FromUCSSurrogate (int cp)
195                 {
196                         return cp + gbxSuppBase;
197                 }
198
199                 public static long FromUCS (int cp)
200                 {
201                         long rawOffset = 0;
202                         long startIgnore = 0x80;
203                         for (int i = 0; i < ranges.Length; i++) {
204                                 GB18030Map m = ranges [i];
205                                 if (cp < m.UStart)
206                                         return ToGbxRaw ((int) (cp
207                                                 - startIgnore + rawOffset));
208                                 if (cp <= m.UEnd)
209                                         return cp - m.UStart + m.GStart;
210                                 if (m.GStart != 0) {
211                                         rawOffset += m.UStart - startIgnore;
212                                         startIgnore = m.UEnd + 1;
213                                 }
214                         }
215                         throw new SystemException (String.Format ("GB18030 INTERNAL ERROR (should not happen): UCS {0:x06}", cp));
216                 }
217
218                 static long FromGBXRaw (
219                         byte b1, byte b2, byte b3, byte b4, bool supp)
220                 {
221                         // 126 = 0xFE - 0x80
222                         return (((b1 - (supp ? 0x90 : 0x81)) * 10 +
223                                 (b2 - 0x30)) * 126 +
224                                 (b3 - 0x81)) * 10 +
225                                 b4 - 0x30 + (supp ? 0x10000 : 0);
226                 }
227
228                 static int ToUcsRaw (int idx)
229                 {
230                         return gbx2uni [idx * 2] * 0x100 +
231                                 gbx2uni [idx * 2 + 1];
232                 }
233
234                 static long ToGbxRaw (int idx)
235                 {
236                         if (idx < 0 || idx * 2 + 1 >= uni2gbxSize)
237                                 return -1;
238                         return gbxBase + uni2gbx [idx * 2] * 0x100 + uni2gbx [idx * 2 + 1];
239                 }
240
241
242 #if BUILD_GENERATOR
243                 public static void Main ()
244                 {
245                         new GB18030Source ().Run ();
246                 }
247
                // Generator output buffers, allocated in Run and filled by
                // AddMap with two bytes per entry (high byte first).
                byte [] uni2gbxMap;
                byte [] gbx2uniMap;
250
251                 void Run ()
252                 {
253                         int ustart = 0x80;
254                         long gstart = 0;
255                         int ucount = 0;
256                         long gcount = 0;
257                         bool skip = false;
258                         for (int i = 0; i < ranges.Length; i++) {
259                                 GB18030Map m = ranges [i];
260                                 if (!skip) {
261 //Console.WriteLine ("---- adding {0:X04} umap. {1:X04} gmap, skip range between {2:X04} and {3:X04}", m.UStart - ustart, m.GStart != 0 ? m.GStart - gstart : 0, m.UStart, m.UEnd);
262                                         ucount += m.UStart - ustart;
263                                 }
264                                 if (m.GStart != 0)
265                                         gcount += m.GStart - gstart;
266                                 skip = m.GStart == 0;
267                                 ustart = m.UEnd + 1;
268                                 if (m.GStart != 0)
269                                         gstart = m.GEnd + 1;
270                         }
271
272 Console.Error.WriteLine ("Total UCS codepoints: {0} ({1:X04})", ucount, ucount);
273 Console.Error.WriteLine ("Total GBX codepoints: {0} ({1:X04})", gcount, gcount);
274
275                         uni2gbxMap = new byte [ucount * 2];
276                         gbx2uniMap = new byte [gcount * 2];
277
278                         XmlDocument doc = new XmlDocument ();
279                         doc.XmlResolver = null;
280                         doc.Load ("gb-18030-2000.xml");
281                         foreach (XmlElement e in doc.SelectNodes (
282                                 "/characterMapping/assignments/a"))
283                                 AddMap (e);
284
285                         using (FileStream fs = File.Create ("gb18030.table")) {
286                                 byte [] size = new byte [4];
287                                 for (int i = 0, len = gbx2uniMap.Length;
288                                         i < 4; i++, len >>= 8)
289                                         size [3 - i] = (byte) (len % 0x100);
290                                 fs.Write (size, 0, 4);
291                                 fs.Write (gbx2uniMap, 0, gbx2uniMap.Length);
292                                 fs.Write (uni2gbxMap, 0, uni2gbxMap.Length);
293                         }
294 Console.WriteLine ("done.");
295                 }
296
297                 void AddMap (XmlElement e)
298                 {
299                         int u = int.Parse (e.GetAttribute ("u"),
300                                 NumberStyles.HexNumber);
301                         byte [] b = new byte [4];
302                         int idx = 0;
303                         foreach (string s in e.GetAttribute ("b").Split (' '))
304                                 b [idx++] =
305                                         byte.Parse (s, NumberStyles.HexNumber);
306                         if (idx != 4)
307                                 return;
308
309                         AddMap (u, b);
310                 }
311
312                 void AddMap (int u, byte [] b)
313                 {
314                         int gbx = (int) (FromGBXRaw (
315                                 b [0], b [1], b [2], b [3], false) - gbxBase);
316                         if (u > 0x10000 || gbx > 0x10000)
317                                 throw new Exception (String.Format (
318                                         "should not happen: {0:X04} {1:X04}",
319                                         u, gbx));
320
321                         int uidx = IndexForUcs (u);
322 //Console.WriteLine ("U: {0:x04} for {1:x04} [{2:x02} {3:x02}]", uidx, u, (byte) (gbx / 0x100), (byte) (gbx % 0x100));
323                         uni2gbxMap [uidx * 2] = (byte) (gbx / 0x100);
324                         uni2gbxMap [uidx * 2 + 1] = (byte) (gbx % 0x100);
325
326                         int gidx = IndexForGbx (gbx);
327 //Console.WriteLine ("G: {0:x04} for {1:x04} ({2:x02} {3:x02} {4:x02} {5:x02})", gidx, gbx, b [0], b [1], b [2], b [3]);
328                         gbx2uniMap [gidx * 2] = (byte) (u / 0x100);
329                         gbx2uniMap [gidx * 2 + 1] = (byte) (u % 0x100);
330                 }
331
332                 static int IndexForUcs (int ucs)
333                 {
334                         int start = 0x80;
335                         int count = 0;
336                         bool skip = false;
337                         for (int i = 0; i < ranges.Length; i++) {
338                                 GB18030Map m = ranges [i];
339                                 if (!skip) {
340                                         if (ucs < m.UStart)
341                                                 return count + ucs - start;
342                                         count += m.UStart - start;
343                                 }
344                                 skip = m.GStart == 0;
345                                 start = m.UEnd + 1;
346                         }
347                         return -1;
348                 }
349
350                 static int IndexForGbx (int gbx)
351                 {
352                         long start = 0;
353                         long count = 0;
354                         for (int i = 0; i < ranges.Length; i++) {
355                                 GB18030Map m = ranges [i];
356                                 if (m.GStart == 0)
357                                         continue;
358                                 if (gbx < m.GStart)
359                                         return (int) (count + gbx - start);
360                                 count += m.GStart - start;
361                                 start = m.GEnd + 1;
362                         }
363                         return -1;
364                 }
365
366 #endif
367
368
369         }
370
371 }