2 // create-normalization-source.cs : creates normalization information table.
5 // Atsushi Enomoto <atsushi@ximian.com>
7 // Copyright (C) 2005 Novell, Inc (http://www.novell.com)
9 // Permission is hereby granted, free of charge, to any person obtaining
10 // a copy of this software and associated documentation files (the
11 // "Software"), to deal in the Software without restriction, including
12 // without limitation the rights to use, copy, modify, merge, publish,
13 // distribute, sublicense, and/or sell copies of the Software, and to
14 // permit persons to whom the Software is furnished to do so, subject to
15 // the following conditions:
17 // The above copyright notice and this permission notice shall be
18 // included in all copies or substantial portions of the Software.
20 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
30 using System.Collections;
31 using System.Globalization;
34 using NUtil = Mono.Globalization.Unicode.NormalizationTableUtil;
// Generator that reads Unicode data files and emits normalization tables.
// NOTE(review): the embedded source-line markers skip numbers throughout this
// file (e.g. 37 and 39 here), so braces and some short statements are not
// visible in this view.
36 namespace Mono.Globalization.Unicode
38 internal class NormalizationCodeGenerator
// Reported in the "Internal error at line ..." message; the code that
// advances it is not visible here.
40 private int lineCount = 0;
// singleCount / multiCount: next free slot in singleNorm / multiNorm (see
// SetNFKC). Both start at 1, so SetNFKC never assigns slot 0.
// propValueCount is not referenced in the visible code.
41 int singleCount = 1, multiCount = 1, propValueCount = 1;
// NOTE(review): these arrays are commented out but are still referenced by
// SerializeNormalizationProps and SetNFKC — presumably that code is excluded
// by a conditional-compilation guard that is not visible here; confirm.
42 // int [] singleNorm = new int [550];
43 // int [] multiNorm = new int [280];
// Property flag table dumped by SerializeNormalizationProps; presumably
// indexed by NUtil.PropIdx (cp) — the store in SetProp is not visible.
44 int [] prop = new int [char.MaxValue + 1];
// Bit flags passed to SetProp.
46 public const int NoNfd = 1;
47 public const int NoNfkd = 2;
48 public const int NoNfc = 4;
49 public const int MaybeNfc = 8;
50 public const int NoNfkc = 16;
51 public const int MaybeNfkc = 32;
52 public const int FullCompositionExclusion = 64;
53 public const int IsUnsafe = 128;
// NOTE(review): the ExpandOn* flags below are commented out, yet
// ParseNormalizationProps still names them in its Expands_On_* cases.
54 // public const int ExpandOnNfd = 256;
55 // public const int ExpandOnNfc = 512;
56 // public const int ExpandOnNfkd = 1024;
57 // public const int ExpandOnNfkc = 2048;
// Created in the run routine and used by RebaseUCD to sort and compare mappings.
59 CharMappingComparer comparer;
// Next free slot in mappedChars. Starts at 1 so that slot 0 stays 0 (RebaseUCD
// relies on 0 meaning "no mapping").
61 int mappedCharCount = 1;
// Zero-terminated runs of mapped characters; doubled in size by
// AddMappedChars when full.
62 int [] mappedChars = new int [100];
// Indexed by NUtil.MapIdx (cp); holds the offset into mappedChars where the
// mapping for cp starts (see SetCanonProp).
63 int [] mapIndex = new int [char.MaxValue + 1];
// List of CharMapping entries collected by ParseUCD.
65 ArrayList mappings = new ArrayList ();
// Combining class per code point, filled by SetCombiningProp.
67 byte [] combining = new byte [0x20000];
// Program entry point: creates a generator instance and runs it.
70 public static void Main ()
72 new NormalizationCodeGenerator ().Run ();
// Presumably the body of Run (), which Main calls.
// NOTE(review): the method header, the statements inside the try block and
// the closing lines are not visible in this view.
// The comparer needs this instance because it reads mappedChars.
77 comparer = new CharMappingComparer (this);
80 } catch (Exception ex) {
// NOTE(review): the caught exception is concatenated into the message text, so
// the new exception carries no InnerException; consider passing ex as the
// second constructor argument instead.
81 throw new InvalidOperationException ("Internal error at line " + lineCount + " : " + ex);
// Reads the combining-class data and writes its table.
85 ProcessCombiningClass ();
// Destination for the generated C# source: standard output.
88 TextWriter CSOut = Console.Out;
// Destination for the generated C declarations. Starts as a null writer and
// is replaced by a StreamWriter on "normalization-tables.h" in the
// serialization methods.
89 TextWriter COut = TextWriter.Null;
// Output stage: writes the normalization property table.
// NOTE(review): any further calls in this method are not visible in this view.
91 private void Serialize ()
93 SerializeNormalizationProps ();
// Writes the mapping tables (mappedChars, charMapIndex, helperIndex,
// mapIdxToComposite) as C# source to CSOut and as C arrays to
// "normalization-tables.h". Each C array is closed with an extra trailing 0.
// NOTE(review): several short lines (conditions, braces, the declaration of
// currentHead, the end of the first CompressArray statement) are not visible
// in this view.
97 private void SerializeUCD ()
// Second argument true = append to the existing header file.
99 COut = new StreamWriter ("normalization-tables.h", true);
// Table 1: the flat list of mapped characters.
102 COut.WriteLine ("static const guint32 mappedChars [] = {");
103 CSOut.WriteLine ("static readonly int [] mappedCharsArr = new int [] {");
104 DumpMapArray (mappedChars, mappedCharCount, false);
105 COut.WriteLine ("0};");
106 CSOut.WriteLine ("};");
// Table 2: per-character offsets into mappedChars.
109 COut.WriteLine ("static const guint16 charMapIndex [] = {");
110 CSOut.WriteLine ("static readonly short [] charMapIndexArr = new short [] {");
111 DumpMapArray (mapIndex, NUtil.MapCount, true);
112 COut.WriteLine ("0};");
113 CSOut.WriteLine ("};");
// Table 3: for each leading mapped character ("head"), the MapIndex of the
// first mapping that starts with it.
115 short [] helperIndexes = new short [0x30000];
117 // GetPrimaryCompositeHelperIndex ()
119 foreach (CharMapping m in mappings) {
120 if (mappedChars [m.MapIndex] == currentHead)
121 continue; // has the same head
124 currentHead = mappedChars [m.MapIndex];
// NOTE(review): MapIndex is narrowed to short here — confirm it always fits.
125 helperIndexes [currentHead] = (short) m.MapIndex;
// Replace the raw array with its compressed form for the NUtil.Helper indexer.
128 helperIndexes = CodePointIndexer.CompressArray (
129 helperIndexes, typeof (short), NUtil.Helper)
132 COut.WriteLine ("static const guint16 helperIndex [] = {");
133 CSOut.WriteLine ("static short [] helperIndexArr = new short [] {");
134 for (int i = 0; i < helperIndexes.Length; i++) {
135 short value = helperIndexes [i];
// The C# output has a decimal and a 4-digit hex form; the condition choosing
// between them is not visible here.
137 CSOut.Write ("{0},", value);
139 CSOut.Write ("0x{0:X04},", value);
140 COut.Write ("{0},", value);
// Row comment: the code point that index i - 15 maps back to.
142 CSOut.WriteLine (" // {0:X04}", NUtil.Helper.ToCodePoint (i - 15));
146 COut.WriteLine ("0};");
147 CSOut.WriteLine ("};");
// Table 4: for each MapIndex, a code point that has that mapping (truncated
// to 16 bits by the ushort cast).
149 ushort [] mapIndexes = new ushort [char.MaxValue + 1];
151 // GetPrimaryCompositeFromMapIndex ()
152 int currentIndex = -1;
153 foreach (CharMapping m in mappings) {
// Presumably skips entries whose MapIndex repeats the previous one; the
// statement guarded by this condition is not visible.
154 if (m.MapIndex == currentIndex)
158 mapIndexes [m.MapIndex] = (ushort) m.CodePoint;
159 currentIndex = m.MapIndex;
162 mapIndexes = CodePointIndexer.CompressArray (mapIndexes, typeof (ushort), NUtil.Composite) as ushort [];
164 COut.WriteLine ("static const guint16 mapIdxToComposite [] = {");
165 CSOut.WriteLine ("static ushort [] mapIdxToCompositeArr = new ushort [] {");
166 for (int i = 0; i < mapIndexes.Length; i++) {
167 ushort value = (ushort) mapIndexes [i];
169 CSOut.Write ("{0},", value);
171 CSOut.Write ("0x{0:X04},", value);
172 COut.Write ("{0},", value);
174 CSOut.WriteLine (" // {0:X04}", NUtil.Composite.ToCodePoint (i - 15));
178 COut.WriteLine ("0};");
179 CSOut.WriteLine ("};");
// Writes the first count elements of array to both outputs.
//   array - values to dump.
//   count - number of leading elements to write; must not exceed array.Length.
//   getCP - when true the row comment shows NUtil.MapCP (i); otherwise the
//           raw index i.
// Throws ArgumentOutOfRangeException when array is shorter than count.
184 private void DumpMapArray (int [] array, int count, bool getCP)
186 if (array.Length < count)
187 throw new ArgumentOutOfRangeException ("count");
188 for (int i = 0; i < count; i++) {
189 int value = array [i];
// The C# output has a decimal and a hex form; the condition choosing between
// them is not visible here. The C output is always decimal.
191 CSOut.Write ("{0}, ", value);
193 CSOut.Write ("0x{0:X}, ", value);
194 COut.Write ("{0},", value);
// Row comment giving the range l - 15 .. l covered by the row just written.
196 int l = getCP ? NUtil.MapCP (i) : i;
197 CSOut.WriteLine ("// {0:X04}-{1:X04}", l - 15, l);
// Writes the normalization property table ("propsArr" in C#, "props" in C),
// plus the singleNorm / multiNorm arrays in C# form.
203 private void SerializeNormalizationProps ()
// Second argument false = overwrite: this call starts the header file afresh,
// whereas the other writers in this file open it in append mode.
205 COut = new StreamWriter ("normalization-tables.h", false);
// NOTE(review): DumpArray is not defined in the visible source, and the
// singleNorm / multiNorm field declarations are commented out — presumably
// this section is compiled out by a guard that is not visible here; confirm.
208 CSOut.WriteLine ("static readonly int [] singleNorm = new int [] {");
209 DumpArray (singleNorm, singleCount, false);
210 CSOut.WriteLine ("};");
211 CSOut.WriteLine ("static readonly int [] multiNorm = new int [] {");
212 DumpArray (multiNorm, multiCount, false);
213 CSOut.WriteLine ("};");
215 CSOut.WriteLine ("static readonly byte [] propsArr = new byte [] {");
216 COut.WriteLine ("static const guint8 props [] = {");
217 DumpPropArray (prop, NUtil.PropCount, true);
218 CSOut.WriteLine ("};");
// The C array is closed with an extra trailing 0 element.
219 COut.WriteLine ("0};");
// Writes the first count elements of array to both outputs, treating each
// value as unsigned.
//   array - values to dump.
//   count - number of leading elements to write; must not exceed array.Length.
//   getCP - when true the row comment shows NUtil.PropCP (i); otherwise the
//           raw index i.
// Throws ArgumentOutOfRangeException when array is shorter than count.
224 private void DumpPropArray (int [] array, int count, bool getCP)
226 if (array.Length < count)
227 throw new ArgumentOutOfRangeException ("count");
228 for (int i = 0; i < count; i++) {
229 uint value = (uint) array [i];
// The C# output has a decimal and a hex form; the condition choosing between
// them is not visible here. The C output is always decimal.
231 CSOut.Write ("{0}, ", value);
233 CSOut.Write ("0x{0:X}, ", value);
234 COut.Write ("{0},", value);
// Row comment giving the range l - 15 .. l covered by the row just written.
236 int l = getCP ? NUtil.PropCP (i) : i;
237 CSOut.WriteLine ("// {0:X04}-{1:X04}", l - 15, l);
// Rebuilds mappedChars so that mappings the comparer reports as equal share
// one zero-terminated run, then updates every CharMapping.MapIndex and the
// mapIndex table to point into the rebuilt array.
// NOTE(review): the declaration of count and some braces / else lines are not
// visible in this view.
243 private void RebaseUCD ()
// After sorting, equal mappings are adjacent, which the mi - 1 comparison
// below depends on.
245 mappings.Sort (comparer);
246 // mappedChars[0] = 0. This assures that value 0 of
247 // mapIndex means there is no mapping.
249 int [] compressedMapping = new int [mappedCharCount];
// newMapIndex [mi] = offset in compressedMapping for the mi-th sorted mapping.
251 int [] newMapIndex = new int [mappings.Count];
252 for (int mi = 0; mi < mappings.Count; mi++) {
253 CharMapping m = (CharMapping) mappings [mi];
// Same sequence as the previous entry: reuse its offset.
254 if (mi > 0 && 0 == comparer.Compare (
255 mappings [mi - 1], mappings [mi])) {
256 newMapIndex [mi] = newMapIndex [mi - 1];
// Otherwise copy this mapping's run, up to its 0 terminator, to the next
// free position and terminate it.
259 newMapIndex [mi] = count;
260 for (int i = m.MapIndex; mappedChars [i] != 0; i++)
261 compressedMapping [count++] = mappedChars [i];
262 compressedMapping [count++] = 0;
// Only now overwrite MapIndex: the loop above still needed the old offsets.
264 for (int mi = 0; mi < mappings.Count; mi++)
265 ((CharMapping) mappings [mi]).MapIndex = newMapIndex [mi];
// Rebuild the per-character index; code points above char.MaxValue are skipped.
267 int [] compressedMapIndex = new int [mapIndex.Length];
268 foreach (CharMapping m in mappings)
269 if (m.CodePoint <= char.MaxValue)
270 compressedMapIndex [NUtil.MapIdx (m.CodePoint)] = m.MapIndex;
// Swap in the rebuilt tables.
272 mappedChars = compressedMapping;
273 mapIndex = compressedMapIndex;
274 mappedCharCount = count;
// Input stage: reads the derived normalization properties.
// NOTE(review): any further calls in this method are not visible in this view.
277 private void Parse ()
279 ParseNormalizationProps ();
// Reads "downloaded/UnicodeData.txt" and, for every entry with a non-empty
// mapping field, records a CharMapping, appends the mapped characters to
// mappedChars (zero-terminated) and stores the start offset via SetCanonProp.
// NOTE(review): several short lines (the declaration and increment of n, the
// guards around the comment stripping, continue statements, braces, part of
// the CharMapping constructor call) are not visible in this view.
283 private void ParseUCD ()
286 TextReader reader = new StreamReader ("downloaded/UnicodeData.txt");
287 while (reader.Peek () != -1) {
288 string line = reader.ReadLine ();
// Drop a trailing '#' comment.
290 int idx = line.IndexOf ('#');
292 line = line.Substring (0, idx);
293 if (line.Length == 0)
// Scan the leading run of letters/digits: the hexadecimal code point.
296 while (Char.IsDigit (line [n]) || Char.IsLetter (line [n]))
298 int cp = int.Parse (line.Substring (0, n), NumberStyles.HexNumber);
299 // Windows does not handle surrogate characters.
// Remaining ';'-separated fields; values [4] is the fifth field after the
// code point and holds the mapping text (presumably the UnicodeData
// decomposition field — confirm against the file format).
303 string [] values = line.Substring (n + 1).Split (';');
304 string canon = values [4];
// Text between the leading '<' and '>' if a tag is present, otherwise "".
// An empty result marks the mapping as canonical below.
305 string combiningCategory = canon.IndexOf ('>') < 0 ? "" : canon.Substring (1, canon.IndexOf ('>') - 1);
306 string mappedCharsValue = canon;
// Strip the "<tag>" prefix (tag length plus the two angle brackets).
307 if (combiningCategory.Length > 0)
308 mappedCharsValue = canon.Substring (combiningCategory.Length + 2).Trim ();
309 if (mappedCharsValue.Length > 0) {
// Remember where this mapping starts in mappedChars.
310 int start = mappedCharCount;
311 mappings.Add (new CharMapping (cp,
313 combiningCategory.Length == 0));
314 SetCanonProp (cp, -1, mappedCharCount);
315 foreach (string v in mappedCharsValue.Split (' '))
317 int.Parse (v, NumberStyles.HexNumber));
// Terminate the run with 0.
318 AddMappedChars (cp, 0);
319 // For canonical composite, set IsUnsafe
320 if (combiningCategory == "") {
// mappedCharCount - 1 excludes the 0 terminator just appended.
321 for (int ca = start; ca < mappedCharCount - 1; ca++)
322 FillUnsafe (mappedChars [ca]);
// Presumably closes the reader; the guarded statement is not visible.
326 if (reader != Console.In)
// Sets the IsUnsafe flag for code point i unless one of the range checks
// below applies.
// NOTE(review): the statements guarded by the two conditions are not visible
// here; presumably each returns without setting the flag.
330 private void FillUnsafe (int i)
// Outside the range representable by a single char.
332 if (i < 0 || i > char.MaxValue)
// Excluded block 0x3400..0x9FBB.
334 if (0x3400 <= i && i <= 0x9FBB)
// cpEnd = -1 is the value used throughout this file for a single code point.
336 SetProp (i, -1, IsUnsafe);
// Appends cv to mappedChars, doubling the array first when it is full.
//   cp - not used in the visible lines.
//   cv - value to append (0 is used by ParseUCD as a terminator).
// NOTE(review): the line assigning tmp back to mappedChars is not visible here.
339 private void AddMappedChars (int cp, int cv)
341 if (mappedCharCount == mappedChars.Length) {
342 int [] tmp = new int [mappedCharCount * 2];
343 Array.Copy (mappedChars, tmp, mappedCharCount);
346 mappedChars [mappedCharCount++] = cv;
// Stores flag in mapIndex for cp, or for the index range cp..cpEnd.
//   cp     - first code point.
//   cpEnd  - last code point of a range; ParseUCD passes -1 for a single one.
//   flag   - value to store (ParseUCD passes the offset into mappedChars).
// NOTE(review): the condition selecting the single / range branch and the
// body of the range loop are not visible in this view.
349 private void SetCanonProp (int cp, int cpEnd, int flag)
351 int idx = NUtil.MapIdx (cp);
353 mapIndex [idx] = flag;
355 int idxEnd = NUtil.MapIdx (cpEnd);
356 for (int i = idx; i <= idxEnd; i++)
// Reads "downloaded/DerivedNormalizationProps.txt" and sets the matching
// property flag for each listed code point or range.
// NOTE(review): many short lines (the switch header, several case labels,
// break statements, the declarations of n and cpEnd, braces) are not visible
// in this view.
361 private void ParseNormalizationProps ()
364 TextReader reader = new StreamReader ("downloaded/DerivedNormalizationProps.txt");
365 while (reader.Peek () != -1) {
366 string line = reader.ReadLine ();
// Drop a trailing '#' comment.
368 int idx = line.IndexOf ('#');
370 line = line.Substring (0, idx);
371 if (line.Length == 0)
// Scan the leading run of letters/digits: the hexadecimal code point.
374 while (Char.IsDigit (line [n]) || Char.IsLetter (line [n]))
376 int cp = int.Parse (line.Substring (0, n), NumberStyles.HexNumber);
377 // Windows does not handle surrogate characters.
// "XXXX..YYYY" denotes a range.
// NOTE(review): the length argument n assumes the range end has exactly as
// many hex digits as the range start — confirm this holds for the input.
382 if (line [n] == '.' && line [n + 1] == '.')
383 cpEnd = int.Parse (line.Substring (n + 2, n), NumberStyles.HexNumber);
// name = text between the first and second ';' (or everything after the first
// ';' when there is no second one, i.e. valueStart == 0).
384 int nameStart = line.IndexOf (';') + 1;
385 int valueStart = line.IndexOf (';', nameStart) + 1;
386 string name = valueStart == 0 ? line.Substring (nameStart) :
387 line.Substring (nameStart, valueStart - nameStart - 1);
// values = trimmed text after the second ';', or "" when absent.
389 string values = valueStart > 0 ?
390 line.Substring (valueStart).Trim () : "";
392 case "Full_Composition_Exclusion":
393 SetProp (cp, cpEnd, FullCompositionExclusion);
396 if (cp != 0xAC00) // Hangul Syllables are computed
397 SetProp (cp, cpEnd, NoNfd);
// A value of "M" selects between two flags; the operands are not visible.
400 SetProp (cp, cpEnd, (values == "M") ?
404 if (cp != 0xAC00) // Hangul Syllables are computed
405 SetProp (cp, cpEnd, NoNfkd);
408 SetProp (cp, cpEnd, (values == "M") ?
// NOTE(review): the ExpandOn* constants used below are commented out in the
// field section of this class — presumably these cases are compiled out by a
// guard that is not visible here; confirm.
412 case "Expands_On_NFD":
413 if (cp != 0xAC00) // Hangul Syllables are computed
414 SetProp (cp, cpEnd, ExpandOnNfd);
416 case "Expands_On_NFC":
417 SetProp (cp, cpEnd, ExpandOnNfc);
419 case "Expands_On_NFKD":
420 if (cp != 0xAC00) // Hangul Syllables are computed
421 SetProp (cp, cpEnd, ExpandOnNfkd);
423 case "Expands_On_NFKC":
424 SetProp (cp, cpEnd, ExpandOnNfkc);
// Collect up to four space-separated hex values; a fifth is rejected with
// NotSupportedException.
429 int v1 = 0, v2 = 0, v3 = 0, v4 = 0;
430 foreach (string s in values.Split (' ')) {
431 if (s.Trim ().Length == 0)
433 int v = int.Parse (s, NumberStyles.HexNumber);
443 throw new NotSupportedException (String.Format ("more than 4 values in FC_NFKC: {0:x}", cp));
445 SetNFKC (cp, cpEnd, v1, v2, v3, v4);
// Applies flag to the property entry for cp, or for the index range
// cp..cpEnd.
//   cp     - first code point.
//   cpEnd  - last code point of a range; callers pass -1 for a single one.
//   flag   - value to apply.
// NOTE(review): the condition guarding the throw, the statements that update
// the table and the branch selection are not visible in this view.
453 private void SetProp (int cp, int cpEnd, int flag)
455 int idx = NUtil.PropIdx (cp);
// NOTE(review): throws the base Exception type; a more specific type such as
// InvalidOperationException would be easier for callers to distinguish.
457 throw new Exception (String.Format ("Codepoint {0:X04} should be included in the indexer.", cp));
461 int idxEnd = NUtil.PropIdx (cpEnd);
462 for (int i = idx; i <= idxEnd; i++)
// Records the values v1..v4 for cp (or cp..cpEnd) in singleNorm or multiNorm
// and stores a reference to the entry through SetProp.
// NOTE(review): the condition choosing the single / multi path, the
// declaration of idx, the lines assigning tmp back to the arrays and the
// start of the second SetProp call are not visible in this view. The
// singleNorm / multiNorm field declarations are commented out, so this method
// is presumably compiled out — confirm.
468 private void SetNFKC (int cp, int cpEnd, int v1, int v2, int v3, int v4)
// Single-value path: look for an existing entry equal to v1.
472 for (int i = 0; i < singleCount; i++)
473 if (singleNorm [i] == v1) {
// Grow by doubling when full, then append v1.
478 if (singleNorm.Length == singleCount) {
479 int [] tmp = new int [singleCount << 1];
480 Array.Copy (singleNorm, tmp, singleCount);
484 singleNorm [singleCount++] = v1;
// The entry index is stored in the bits above the low 16.
486 SetProp (cp, cpEnd, idx << 16);
// Multi-value path.
// NOTE(review): the growth check fires only when multiCount equals the array
// length exactly, yet four entries are appended per call — verify that the
// array cannot overflow between checks.
488 if (multiNorm.Length == multiCount) {
489 int [] tmp = new int [multiCount << 1];
490 Array.Copy (multiNorm, tmp, multiCount);
// The 0xF0000000 bits are OR-ed into the stored value on this path only.
494 (int) ((multiCount << 16) | 0xF0000000));
495 multiNorm [multiCount++] = v1;
496 multiNorm [multiCount++] = v2;
497 multiNorm [multiCount++] = v3;
498 multiNorm [multiCount++] = v4;
// Members of the CharMapping type: one entry per code point that has a mapping.
// NOTE(review): the class header, most of the constructor body and the
// MapIndex member (assigned by RebaseUCD) are not visible in this view.
//   cp          - code point that owns the mapping.
//   mapIndex    - offset of the mapping's first character in mappedChars.
//   isCanonical - ParseUCD passes true when the mapping has no "<tag>" prefix.
505 public CharMapping (int cp, int mapIndex, bool isCanonical)
509 IsCanonical = isCanonical;
513 public readonly int CodePoint;
514 public readonly bool IsCanonical;
// IComparer for CharMapping objects that compares the character sequences
// stored in the generator's mappedChars array, starting at each MapIndex.
// NOTE(review): the constructor body and the comparison / return logic inside
// CompareArray are not visible in this view.
517 class CharMappingComparer : IComparer
// Generator whose mappedChars array is read during comparison. The array is
// read through this reference on every call, so a replacement made by
// RebaseUCD is picked up.
519 NormalizationCodeGenerator parent;
521 public CharMappingComparer (NormalizationCodeGenerator g)
526 // Note that this never considers IsCanonical
// Both arguments must be CharMapping instances (direct casts).
527 public int Compare (object o1, object o2)
529 CharMapping c1 = (CharMapping) o1;
530 CharMapping c2 = (CharMapping) o2;
531 return CompareArray (c1.MapIndex, c2.MapIndex);
534 // Note that this never considers IsCanonical
// Walks both sequences in step from idx1 and idx2; the loop has no bound of
// its own, so it must end from inside the body.
535 public int CompareArray (int idx1, int idx2)
537 for (int i = 0; ; i++) {
538 int l = parent.mappedChars [idx1 + i];
539 int r = parent.mappedChars [idx2 + i];
// Reads "downloaded/DerivedCombiningClass.txt", fills the combining table,
// compresses it and writes it as "combiningClassArr" (C#) and
// "combiningClass" (C, appended to "normalization-tables.h").
// NOTE(review): several short lines (the declarations of n and cpEnd,
// continue statements, conditions inside the dump loop, braces, closing of
// the reader and writer) are not visible in this view.
548 private void ProcessCombiningClass ()
550 TextReader reader = new StreamReader ("downloaded/DerivedCombiningClass.txt");
551 while (reader.Peek () != -1) {
552 string line = reader.ReadLine ();
// Drop a trailing '#' comment and surrounding whitespace.
554 int idx = line.IndexOf ('#');
556 line = line.Substring (0, idx).Trim ();
557 if (line.Length == 0)
// Scan the leading run of letters/digits: the hexadecimal code point.
560 while (Char.IsDigit (line [n]) || Char.IsLetter (line [n]))
562 int cp = int.Parse (line.Substring (0, n), NumberStyles.HexNumber);
563 // Windows does not handle surrogate characters.
// "XXXX..YYYY" denotes a range.
// NOTE(review): the length argument n assumes the range end has exactly as
// many hex digits as the range start — confirm this holds for the input.
568 if (line [n] == '.' && line [n + 1] == '.')
569 cpEnd = int.Parse (line.Substring (n + 2, n), NumberStyles.HexNumber);
// val = text between the first and second ';' (or everything after the first
// ';' when there is no second one); parsed as the combining class number.
570 int nameStart = line.IndexOf (';') + 1;
571 int valueStart = line.IndexOf (';', nameStart) + 1;
572 string val = valueStart == 0 ? line.Substring (nameStart) :
573 line.Substring (nameStart, valueStart - nameStart - 1);
574 SetCombiningProp (cp, cpEnd, short.Parse (val));
// Compress the table for the NUtil.Combining indexer.
579 byte [] ret = (byte []) CodePointIndexer.CompressArray (
580 combining, typeof (byte), NUtil.Combining);
// Second argument true = append to the existing header file.
582 COut = new StreamWriter ("normalization-tables.h", true);
584 COut.WriteLine ("static const guint8 combiningClass [] = {");
585 CSOut.WriteLine ("public static byte [] combiningClassArr = new byte [] {");
586 for (int i = 0; i < ret.Length; i++) {
587 byte value = ret [i];
// The C# output has a decimal and a 2-digit hex form; the condition choosing
// between them is not visible here.
589 CSOut.Write ("{0},", value);
591 CSOut.Write ("0x{0:X02},", value);
592 COut.Write ("{0},", value);
// Row comment: the code point that index i - 15 maps back to.
594 CSOut.WriteLine (" // {0:X04}", NUtil.Combining.ToCodePoint (i - 15));
598 CSOut.WriteLine ("};");
// The C array is closed with an extra trailing 0 element.
599 COut.WriteLine ("0};");
// Stores the combining class val for cp, or for every code point in
// cp..cpEnd (inclusive).
//   cp     - first code point; used directly as an index into combining.
//   cpEnd  - last code point of a range.
//   val    - combining class; narrowed to byte when stored.
// NOTE(review): the condition selecting the single / range branch and any
// bounds check against the size of combining are not visible in this view.
604 private void SetCombiningProp (int cp, int cpEnd, short val)
609 combining [cp] = (byte) val;
611 for (int i = cp; i <= cpEnd; i++)
612 combining [i] = (byte) val;