5 // Atsushi Enomoto <atsushi@ximian.com>
8 using System.Globalization;
9 using System.Reflection;
11 using System.Runtime.InteropServices;
19 internal unsafe class GB18030Source
// Bounds of one mapping range. Values are supplied per entry in the ranges table
// (presumably stored from the constructor arguments in the same order - the
// constructor body is not visible here) and read back as m.UStart, m.GStart, etc.
// UStart/UEnd: first and last Unicode code point of the range. UEnd is inclusive:
// FromUCS resumes scanning at UEnd + 1.
23 public readonly int UStart;
24 public readonly int UEnd;
// GStart/GEnd: the matching bounds as linear four-byte GB18030 values (the table
// builds them with FromGBXRaw). GEnd is inclusive: FromGBX tests linear <= GEnd.
// Entries flagged as dummy in the ranges table pass 0 for both.
25 public readonly long GStart;
26 public readonly long GEnd;
27 public readonly bool Dummy; // This range is actually not usable.
30 int ustart, int uend, long gstart, long gend, bool dummy)
// Private constructor: instances can only be created from inside this class
// (Main does so to call Run).
40 private GB18030Source ()
// Raw byte pointers to the two lookup tables, assigned in the static constructor.
// ToUcsRaw reads gbx2uni and ToGbxRaw reads uni2gbx, two bytes per entry.
44 static readonly byte *gbx2uni;
45 static readonly byte *uni2gbx;
// uni2gbxSize is the bound ToGbxRaw checks its byte index against; gbx2uniSize is
// used as the byte offset from gbx2uni to uni2gbx.
46 static readonly int gbx2uniSize, uni2gbxSize;
// Static initializer: locates the embedded "gb18030.table" resource and sets up
// the gbx2uni / uni2gbx table pointers from it.
// NOTE(review): several lines of this body are not visible (declarations of
// size and mod, the condition choosing between the two loading paths, and the
// targets of the two size expressions below) - confirm against the full file.
48 static GB18030Source ()
// Look up the non-public instance method Assembly.GetManifestResourceInternal
// via reflection.
50 MethodInfo mi = typeof (Assembly).GetMethod (
51 "GetManifestResourceInternal",
52 BindingFlags.NonPublic | BindingFlags.Instance);
56 IntPtr ret = IntPtr.Zero;
// Path 1: ask the internal method for the resource; its result is cast to IntPtr.
// Presumably taken only when mi is non-null - the guard is not visible here.
60 ret = (IntPtr)mi.Invoke(
61 Assembly.GetExecutingAssembly(),
62 new object[] { "gb18030.table", size, mod });
// Path 2: read the resource stream one byte at a time into a buffer obtained
// from Marshal.AllocHGlobal.
// NOTE(review): no matching FreeHGlobal is visible; presumably the buffer is
// meant to stay alive for the lifetime of the process - confirm.
67 using (var ms = Assembly.GetExecutingAssembly()
68 .GetManifestResourceStream("gb18030.table"))
70 var len = (int)ms.Length;
71 byte* buf = (byte*)Marshal.AllocHGlobal(sizeof(byte) * len);
73 for (int i = 0; i < len; i++)
74 buf[i] = (byte)ms.ReadByte();
// Only set up the table pointers when a resource pointer was obtained.
80 if (ret != IntPtr.Zero) {
81 gbx2uni = (byte*) ((void*) ret);
// Big-endian 32-bit value built from the first four bytes at gbx2uni
// (presumably assigned to gbx2uniSize - the left-hand side is not visible).
83 (gbx2uni [0] << 24) + (gbx2uni [1] << 16) +
84 (gbx2uni [2] << 8) + (gbx2uni [3]);
// The second table starts gbx2uniSize bytes after gbx2uni.
86 uni2gbx = gbx2uni + gbx2uniSize;
// Big-endian 32-bit value built from the first four bytes at uni2gbx
// (presumably assigned to uni2gbxSize - the left-hand side is not visible).
88 (uni2gbx [0] << 24) + (uni2gbx [1] << 16) +
89 (uni2gbx [2] << 8) + (uni2gbx [3]);
// Linear value of the four-byte sequence 81 30 81 30, computed with supp = false.
// ToGbxRaw adds it to table entries and AddMap subtracts it before storing them.
94 static readonly long gbxBase =
95 FromGBXRaw (0x81, 0x30, 0x81, 0x30, false);
// Linear value of the four-byte sequence 90 30 81 30, computed with supp = false.
// FromUCSSurrogate adds it to its argument.
96 static readonly long gbxSuppBase =
97 FromGBXRaw (0x90, 0x30, 0x81, 0x30, false);
99 // See http://icu.sourceforge.net/docs/papers/gb18030.html
100 // and referenced XML mapping table.
// Each entry is (UStart, UEnd, GStart, GEnd, dummy): a Unicode range and the
// linear four-byte GB18030 range it corresponds to. Within a non-dummy entry
// FromUCS converts with a constant offset (cp - UStart + GStart).
// The "rawmap" comments mark the Unicode spans lying between consecutive
// entries; presumably those are the code points resolved through the
// uni2gbx / gbx2uni lookup tables instead - confirm against FromUCS / FromGBX.
// Entries must stay in ascending order: the scanning loops accumulate offsets
// from one entry to the next.
101 static readonly GB18030Map [] ranges = new GB18030Map [] {
102 // rawmap: 0x0080-0x0451
103 new GB18030Map (0x0452, 0x200F, FromGBXRaw (0x81, 0x30, 0xD3, 0x30, false), FromGBXRaw (0x81, 0x36, 0xA5, 0x31, false), false),
104 // rawmap: 0x2010-0x2642
105 new GB18030Map (0x2643, 0x2E80, FromGBXRaw (0x81, 0x37, 0xA8, 0x39, false), FromGBXRaw (0x81, 0x38, 0xFD, 0x38, false), false),
106 // rawmap: 0x2E81-0x361A
107 new GB18030Map (0x361B, 0x3917, FromGBXRaw (0x82, 0x30, 0xA6, 0x33, false), FromGBXRaw (0x82, 0x30, 0xF2, 0x37, false), false),
108 // rawmap: 0x3918-0x3CE0
109 new GB18030Map (0x3CE1, 0x4055, FromGBXRaw (0x82, 0x31, 0xD4, 0x38, false), FromGBXRaw (0x82, 0x32, 0xAF, 0x32, false), false),
110 // rawmap: 0x4056-0x415F
111 new GB18030Map (0x4160, 0x4336, FromGBXRaw (0x82, 0x32, 0xC9, 0x37, false), FromGBXRaw (0x82, 0x32, 0xF8, 0x37, false), false),
112 // rawmap: 0x4337-0x44D6
113 new GB18030Map (0x44D7, 0x464B, FromGBXRaw (0x82, 0x33, 0xA3, 0x39, false), FromGBXRaw (0x82, 0x33, 0xC9, 0x31, false), false),
114 // rawmap: 0x464C-0x478D
115 new GB18030Map (0x478E, 0x4946, FromGBXRaw (0x82, 0x33, 0xE8, 0x38, false), FromGBXRaw (0x82, 0x34, 0x96, 0x38, false), false),
116 // rawmap: 0x4947-0x49B7
117 new GB18030Map (0x49B8, 0x4C76, FromGBXRaw (0x82, 0x34, 0xA1, 0x31, false), FromGBXRaw (0x82, 0x34, 0xE7, 0x33, false), false),
118 // rawmap: 0x4C77-0x4DFF
120 // 0x4E00-0x9FA5 are all mapped in GB2312 (dummy entry, no four-byte range)
121 new GB18030Map (0x4E00, 0x9FA5, 0, 0, true),
123 new GB18030Map (0x9FA6, 0xD7FF, FromGBXRaw (0x82, 0x35, 0x8F, 0x33, false), FromGBXRaw (0x83, 0x36, 0xC7, 0x38, false), false),
125 // 0xD800-0xDFFF are ignored (surrogates)
126 // 0xE000-0xE76B are all mapped in GB2312 (both spans share one dummy entry)
127 new GB18030Map (0xD800, 0xE76B, 0, 0, true),
129 // rawmap: 0xE76C-0xE864
130 new GB18030Map (0xE865, 0xF92B, FromGBXRaw (0x83, 0x36, 0xD0, 0x30, false), FromGBXRaw (0x84, 0x30, 0x85, 0x34, false), false),
131 // rawmap: 0xF92C-0xFA29
132 new GB18030Map (0xFA2A, 0xFE2F, FromGBXRaw (0x84, 0x30, 0x9C, 0x38, false), FromGBXRaw (0x84, 0x31, 0x85, 0x37, false), false),
133 // rawmap: 0xFE30-0xFFE5
134 new GB18030Map (0xFFE6, 0xFFFF, FromGBXRaw (0x84, 0x31, 0xA2, 0x34, false), FromGBXRaw (0x84, 0x31, 0xA4, 0x39, false), false),
// Array overload: pins bytes and forwards to the pointer overload, which writes
// bytes [start] through bytes [start + 3].
// NOTE(review): no null or length check on bytes is visible here; callers
// presumably guarantee at least start + 4 elements - confirm.
137 public static void Unlinear (byte [] bytes, int start, long gbx)
139 fixed (byte* bptr = bytes) {
140 Unlinear (bptr + start, gbx);
// Writes a four-byte sequence derived from the linear value gbx into bytes [0..3],
// filling from the last byte to the first. Bytes 3 and 1 are a base-10 digit
// offset by 0x30; byte 2 is a base-126 digit offset by 0x81; byte 0 is the
// remaining value offset by 0x81.
// NOTE(review): the statements between these assignments are not visible;
// presumably gbx is divided by 10 / 126 / 10 after each step - confirm.
144 public static unsafe void Unlinear (byte* bytes, long gbx)
146 bytes [3] = (byte) (gbx % 10 + 0x30);
148 bytes [2] = (byte) (gbx % 126 + 0x81);
150 bytes [1] = (byte) (gbx % 10 + 0x30);
152 bytes [0] = (byte) (gbx + 0x81);
// Converts the four bytes at bytes [start .. start + 3] to a code point.
// Return value convention (from the original author):
155 // negative (invalid) or positive (valid)
// NOTE(review): several lines of this body are not visible (the statements run
// when a range check fires, the guard in front of the supplementary return, the
// declaration of rawOffset, and the tail of one return expression).
156 public static long FromGBX (byte [] bytes, int start)
158 byte b1 = bytes [start];
159 byte b2 = bytes [start + 1];
160 byte b3 = bytes [start + 2];
161 byte b4 = bytes [start + 3];
// Range checks: b1 and b3 are tested against 0x81..0xFE, b2 and b4 against
// 0x30..0x39. What each failing check returns is not visible here.
162 if (b1 < 0x81 || b1 == 0xFF)
164 if (b2 < 0x30 || b2 > 0x39)
166 if (b3 < 0x81 || b3 == 0xFF)
168 if (b4 < 0x30 || b4 > 0x39)
// Supplementary form (supp = true); presumably guarded by a test on b1 - confirm.
171 return FromGBXRaw (b1, b2, b3, b4, true);
172 long linear = FromGBXRaw (b1, b2, b3, b4, false);
175 long startIgnore = 0;
// Walk the ranges in order. startIgnore is the first linear value after the
// previous range; rawOffset accumulates the sizes of the gaps passed so far.
177 for (int i = 0; i < ranges.Length; i++) {
178 GB18030Map m = ranges [i];
// Below the current range: the value lies in a gap, so use the lookup table.
179 if (linear < m.GStart)
180 return ToUcsRaw ((int) (linear
181 - startIgnore + rawOffset));
// Inside the current range: computed from the offset within the range
// (the remainder of this expression is not visible here).
182 if (linear <= m.GEnd)
183 return linear - gbxBase - m.GStart
186 rawOffset += m.GStart - startIgnore;
187 startIgnore = m.GEnd + 1;
190 // return ToUcsRaw ((int) (linear - gbxBase));
// Reached only when no range or gap matched.
191 throw new SystemException (String.Format ("GB18030 INTERNAL ERROR (should not happen): GBX {0:x02} {1:x02} {2:x02} {3:x02}", b1, b2, b3, b4));
// Returns cp shifted by gbxSuppBase (the linear value of 90 30 81 30).
// NOTE(review): presumably cp is expected to be the code point's offset from
// U+10000 rather than the code point itself - verify against callers.
194 public static long FromUCSSurrogate (int cp)
196 return cp + gbxSuppBase;
// Converts the Unicode code point cp to a linear four-byte GB18030 value by
// scanning the ranges table in order.
// NOTE(review): the declaration of rawOffset and the conditions guarding the two
// returns are not visible; presumably they test cp < m.UStart and cp <= m.UEnd,
// mirroring FromGBX - confirm.
199 public static long FromUCS (int cp)
// First code point covered by the lookup table; the scan starts at 0x80.
202 long startIgnore = 0x80;
203 for (int i = 0; i < ranges.Length; i++) {
204 GB18030Map m = ranges [i];
// Gap before the current range: table lookup at the accumulated gap offset.
206 return ToGbxRaw ((int) (cp
207 - startIgnore + rawOffset));
// Inside the current range: constant offset from the range start.
209 return cp - m.UStart + m.GStart;
// Advance past this range, adding the size of the gap just skipped.
211 rawOffset += m.UStart - startIgnore;
212 startIgnore = m.UEnd + 1;
// Reached only when no range or gap matched.
215 throw new SystemException (String.Format ("GB18030 INTERNAL ERROR (should not happen): UCS {0:x06}", cp));
// Folds four bytes into a single linear value. b1 is taken relative to 0x90 when
// supp is true and to 0x81 otherwise, b4 is taken relative to 0x30, and 0x10000
// is added when supp is true.
// NOTE(review): the middle of the expression (the b2 and b3 terms) is not
// visible; presumably it mirrors the 10 / 126 / 10 digit bases used by
// Unlinear - confirm. No validation of the byte values is visible here.
218 static long FromGBXRaw (
219 byte b1, byte b2, byte b3, byte b4, bool supp)
222 return (((b1 - (supp ? 0x90 : 0x81)) * 10 +
225 b4 - 0x30 + (supp ? 0x10000 : 0);
// Reads entry idx of the gbx2uni table: two bytes combined high byte first.
// NOTE(review): unlike ToGbxRaw, no bounds check on idx is visible here, and
// gbx2uni is only assigned when the static constructor obtained a resource
// pointer - confirm that callers always pass an in-range index.
228 static int ToUcsRaw (int idx)
230 return gbx2uni [idx * 2] * 0x100 +
231 gbx2uni [idx * 2 + 1];
// Reads entry idx of the uni2gbx table (two bytes, high byte first) and adds
// gbxBase to obtain a linear four-byte GB18030 value.
234 static long ToGbxRaw (int idx)
// Reject indexes whose second byte would fall outside the table; the statement
// executed in that case is not visible here (presumably a negative return).
236 if (idx < 0 || idx * 2 + 1 >= uni2gbxSize)
238 return gbxBase + uni2gbx [idx * 2] * 0x100 + uni2gbx [idx * 2 + 1];
// Entry point: creates an instance and calls Run, whose visible body writes the
// "gb18030.table" file.
// NOTE(review): presumably compiled only in a table-generator build - the
// enclosing conditional, if any, is not visible here.
243 public static void Main ()
245 new GB18030Source ().Run ();
// Body of the table generator (called as Run from Main; the method header and
// the declarations of ucount, gcount, ustart, gstart and skip are not visible).
// Pass 1: walk the ranges and total the number of Unicode and GB code points
// lying in the gaps between entries; these totals size the two maps.
258 for (int i = 0; i < ranges.Length; i++) {
259 GB18030Map m = ranges [i];
261 //Console.WriteLine ("---- adding {0:X04} umap. {1:X04} gmap, skip range between {2:X04} and {3:X04}", m.UStart - ustart, m.GStart != 0 ? m.GStart - gstart : 0, m.UStart, m.UEnd);
262 ucount += m.UStart - ustart;
265 gcount += m.GStart - gstart;
// Entries with GStart == 0 (the dummy ranges) set the skip flag.
266 skip = m.GStart == 0;
272 Console.Error.WriteLine ("Total UCS codepoints: {0} ({1:X04})", ucount, ucount);
273 Console.Error.WriteLine ("Total GBX codepoints: {0} ({1:X04})", gcount, gcount);
// Two bytes per table entry.
275 uni2gbxMap = new byte [ucount * 2];
276 gbx2uniMap = new byte [gcount * 2];
// Pass 2: load the XML mapping file (external entity resolution disabled) and
// visit every assignment element; the loop body is not visible here, presumably
// it calls AddMap for each element.
278 XmlDocument doc = new XmlDocument ();
279 doc.XmlResolver = null;
280 doc.Load ("gb-18030-2000.xml");
281 foreach (XmlElement e in doc.SelectNodes (
282 "/characterMapping/assignments/a"))
// Output: a 4-byte big-endian length of gbx2uniMap, then gbx2uniMap, then
// uni2gbxMap.
// NOTE(review): only one length header is written here, whereas the static
// constructor also reads a 4-byte value at the start of the uni2gbx section -
// confirm that the writer and the reader agree on the file layout.
285 using (FileStream fs = File.Create ("gb18030.table")) {
286 byte [] size = new byte [4];
287 for (int i = 0, len = gbx2uniMap.Length;
288 i < 4; i++, len >>= 8)
289 size [3 - i] = (byte) (len % 0x100);
290 fs.Write (size, 0, 4);
291 fs.Write (gbx2uniMap, 0, gbx2uniMap.Length);
292 fs.Write (uni2gbxMap, 0, uni2gbxMap.Length);
294 Console.WriteLine ("done.");
// Parses one assignment element: attribute "u" is a hexadecimal Unicode code
// point and attribute "b" is a space-separated list of hexadecimal byte values.
// NOTE(review): the lines storing the parsed bytes into b and the final call
// are not visible; presumably each value fills the next slot of b and the pair
// is passed to AddMap (int, byte []) - confirm.
297 void AddMap (XmlElement e)
299 int u = int.Parse (e.GetAttribute ("u"),
300 NumberStyles.HexNumber);
301 byte [] b = new byte [4];
303 foreach (string s in e.GetAttribute ("b").Split (' '))
305 byte.Parse (s, NumberStyles.HexNumber);
// Records one mapping pair in both tables: u is the Unicode code point and b
// holds the four GB18030 bytes.
312 void AddMap (int u, byte [] b)
// Linear value of the four bytes, relative to gbxBase.
314 int gbx = (int) (FromGBXRaw (
315 b [0], b [1], b [2], b [3], false) - gbxBase);
// Entries are stored as two bytes, so larger values are rejected.
// NOTE(review): the test uses > rather than >=, so 0x10000 itself passes even
// though it does not fit in two bytes - confirm whether that is intended.
316 if (u > 0x10000 || gbx > 0x10000)
317 throw new Exception (String.Format (
318 "should not happen: {0:X04} {1:X04}",
// Unicode -> GB table: store gbx high byte first at the slot for u.
321 int uidx = IndexForUcs (u);
322 //Console.WriteLine ("U: {0:x04} for {1:x04} [{2:x02} {3:x02}]", uidx, u, (byte) (gbx / 0x100), (byte) (gbx % 0x100));
323 uni2gbxMap [uidx * 2] = (byte) (gbx / 0x100);
324 uni2gbxMap [uidx * 2 + 1] = (byte) (gbx % 0x100);
// GB -> Unicode table: store u high byte first at the slot for gbx.
326 int gidx = IndexForGbx (gbx);
327 //Console.WriteLine ("G: {0:x04} for {1:x04} ({2:x02} {3:x02} {4:x02} {5:x02})", gidx, gbx, b [0], b [1], b [2], b [3]);
328 gbx2uniMap [gidx * 2] = (byte) (u / 0x100);
329 gbx2uniMap [gidx * 2 + 1] = (byte) (u % 0x100);
// Computes the entry index in the uni2gbx table for the code point ucs by
// walking the ranges and counting the gap code points that precede it.
// NOTE(review): the declarations of count, start and skip, the condition in
// front of the return, and the fall-through result are not visible here.
332 static int IndexForUcs (int ucs)
337 for (int i = 0; i < ranges.Length; i++) {
338 GB18030Map m = ranges [i];
// ucs lies in the gap before the current range: its index is the running count
// plus its offset within the gap.
341 return count + ucs - start;
// Otherwise add the size of the gap just passed.
342 count += m.UStart - start;
// Entries with GStart == 0 (the dummy ranges) set the skip flag.
344 skip = m.GStart == 0;
350 static int IndexForGbx (int gbx)
354 for (int i = 0; i < ranges.Length; i++) {
355 GB18030Map m = ranges [i];
359 return (int) (count + gbx - start);
360 count += m.GStart - start;