2005-08-04 Atsushi Enomoto <atsushi@ximian.com>
[mono.git] / mcs / class / corlib / Mono.Globalization.Unicode / create-normalization-source.cs
1 //
2 // create-normalization-source.cs : creates normalization information table.
3 //
4 // Author:
5 //      Atsushi Enomoto  <atsushi@ximian.com>
6 //
7 // Copyright (C) 2005 Novell, Inc (http://www.novell.com)
8 //
9 // Permission is hereby granted, free of charge, to any person obtaining
10 // a copy of this software and associated documentation files (the
11 // "Software"), to deal in the Software without restriction, including
12 // without limitation the rights to use, copy, modify, merge, publish,
13 // distribute, sublicense, and/or sell copies of the Software, and to
14 // permit persons to whom the Software is furnished to do so, subject to
15 // the following conditions:
16 // 
17 // The above copyright notice and this permission notice shall be
18 // included in all copies or substantial portions of the Software.
19 // 
20 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
27 //
28
29 using System;
30 using System.Collections;
31 using System.Globalization;
32 using System.IO;
33
34 using NUtil = Mono.Globalization.Unicode.NormalizationTableUtil;
35
36 namespace Mono.Globalization.Unicode
37 {
38         internal class NormalizationCodeGenerator
39         {
40                 private int lineCount = 0;
41                 int singleCount = 1, multiCount = 1, propValueCount = 1;
42 //              int [] singleNorm = new int [550];
43 //              int [] multiNorm = new int [280];
44                 int [] prop = new int [char.MaxValue + 1];
45
46                 public const int NoNfd = 1;
47                 public const int NoNfkd = 2;
48                 public const int NoNfc = 4;
49                 public const int MaybeNfc = 8;
50                 public const int NoNfkc = 16;
51                 public const int MaybeNfkc = 32;
52                 public const int FullCompositionExclusion = 64;
53                 public const int IsUnsafe = 128;
54 //              public const int ExpandOnNfd = 256;
55 //              public const int ExpandOnNfc = 512;
56 //              public const int ExpandOnNfkd = 1024;
57 //              public const int ExpandOnNfkc = 2048;
58
59                 CharMappingComparer comparer;
60
61                 int mappedCharCount = 1;
62                 int [] mappedChars = new int [100];
63                 int [] mapIndex = new int [char.MaxValue + 1];
64
65                 ArrayList mappings = new ArrayList ();
66
67                 byte [] combining = new byte [0x20000];
68
69
70                 public static void Main ()
71                 {
72                         new NormalizationCodeGenerator ().Run ();
73                 }
74
75                 private void Run ()
76                 {
77                         comparer = new CharMappingComparer (this);
78                         try {
79                                 Parse ();
80                         } catch (Exception ex) {
81                                 throw new InvalidOperationException ("Internal error at line " + lineCount + " : " + ex);
82                         }
83                         RebaseUCD ();
84                         Serialize ();
85                         ProcessCombiningClass ();
86                 }
87
88                 TextWriter CSOut = Console.Out;
89                 TextWriter COut = TextWriter.Null;
90
91                 private void Serialize ()
92                 {
93                         SerializeNormalizationProps ();
94                         SerializeUCD ();
95                 }
96
97                 private void SerializeUCD ()
98                 {
99                         COut = new StreamWriter ("normalization-tables.h", true);
100
101                         // mappedChars
102                         COut.WriteLine ("static const guint32 mappedChars [] = {");
103                         CSOut.WriteLine ("static readonly int [] mappedCharsArr = new int [] {");
104                         DumpMapArray (mappedChars, mappedCharCount, false);
105                         COut.WriteLine ("0};");
106                         CSOut.WriteLine ("};");
107
108                         // charMapIndex
109                         COut.WriteLine ("static const guint16 charMapIndex [] = {");
110                         CSOut.WriteLine ("static readonly short [] charMapIndexArr = new short [] {");
111                         DumpMapArray (mapIndex, NUtil.MapCount, true);
112                         COut.WriteLine ("0};");
113                         CSOut.WriteLine ("};");
114
115                         short [] helperIndexes = new short [0x30000];
116
117                         // GetPrimaryCompositeHelperIndex ()
118                         int currentHead = 0;
119                         foreach (CharMapping m in mappings) {
120                                 if (mappedChars [m.MapIndex] == currentHead)
121                                         continue; // has the same head
122                                 if (!m.IsCanonical)
123                                         continue;
124                                 currentHead = mappedChars [m.MapIndex];
125                                 helperIndexes [currentHead] = (short) m.MapIndex;
126                         }
127
128                         helperIndexes = CodePointIndexer.CompressArray (
129                                 helperIndexes, typeof (short), NUtil.Helper)
130                                 as short [];
131
132                         COut.WriteLine ("static const guint16 helperIndex [] = {");
133                         CSOut.WriteLine ("static short [] helperIndexArr = new short [] {");
134                         for (int i = 0; i < helperIndexes.Length; i++) {
135                                 short value = helperIndexes [i];
136                                 if (value < 10)
137                                         CSOut.Write ("{0},", value);
138                                 else
139                                         CSOut.Write ("0x{0:X04},", value);
140                                 COut.Write ("{0},", value);
141                                 if (i % 16 == 15) {
142                                         CSOut.WriteLine (" // {0:X04}", NUtil.Helper.ToCodePoint (i - 15));
143                                         COut.WriteLine ();
144                                 }
145                         }
146                         COut.WriteLine ("0};");
147                         CSOut.WriteLine ("};");
148
149                         ushort [] mapIndexes = new ushort [char.MaxValue + 1];
150
151                         // GetPrimaryCompositeFromMapIndex ()
152                         int currentIndex = -1;
153                         foreach (CharMapping m in mappings) {
154                                 if (m.MapIndex == currentIndex)
155                                         continue;
156                                 if (!m.IsCanonical)
157                                         continue;
158                                 mapIndexes [m.MapIndex] = (ushort) m.CodePoint;
159                                 currentIndex = m.MapIndex;
160                         }
161
162                         mapIndexes = CodePointIndexer.CompressArray (mapIndexes, typeof (ushort), NUtil.Composite) as ushort [];
163
164                         COut.WriteLine ("static const guint16 mapIdxToComposite [] = {");
165                         CSOut.WriteLine ("static ushort [] mapIdxToCompositeArr = new ushort [] {");
166                         for (int i = 0; i < mapIndexes.Length; i++) {
167                                 ushort value = (ushort) mapIndexes [i];
168                                 if (value < 10)
169                                         CSOut.Write ("{0},", value);
170                                 else
171                                         CSOut.Write ("0x{0:X04},", value);
172                                 COut.Write ("{0},", value);
173                                 if (i % 16 == 15) {
174                                         CSOut.WriteLine (" // {0:X04}", NUtil.Composite.ToCodePoint (i - 15));
175                                         COut.WriteLine ();
176                                 }
177                         }
178                         COut.WriteLine ("0};");
179                         CSOut.WriteLine ("};");
180
181                         COut.Close ();
182                 }
183
184                 private void DumpMapArray (int [] array, int count, bool getCP)
185                 {
186                         if (array.Length < count)
187                                 throw new ArgumentOutOfRangeException ("count");
188                         for (int i = 0; i < count; i++) {
189                                 int value = array [i];
190                                 if (value < 10)
191                                         CSOut.Write ("{0}, ", value);
192                                 else
193                                         CSOut.Write ("0x{0:X}, ", value);
194                                 COut.Write ("{0},", value);
195                                 if (i % 16 == 15) {
196                                         int l = getCP ? NUtil.MapCP (i) : i;
197                                         CSOut.WriteLine ("// {0:X04}-{1:X04}", l - 15, l);
198                                         COut.WriteLine ();
199                                 }
200                         }
201                 }
202
203                 private void SerializeNormalizationProps ()
204                 {
205                         COut = new StreamWriter ("normalization-tables.h", false);
206
207                         /*
208                         CSOut.WriteLine ("static readonly int [] singleNorm = new int [] {");
209                         DumpArray (singleNorm, singleCount, false);
210                         CSOut.WriteLine ("};");
211                         CSOut.WriteLine ("static readonly int [] multiNorm = new int [] {");
212                         DumpArray (multiNorm, multiCount, false);
213                         CSOut.WriteLine ("};");
214                         */
215                         CSOut.WriteLine ("static readonly byte [] propsArr = new byte [] {");
216                         COut.WriteLine ("static const guint8 props [] = {");
217                         DumpPropArray (prop, NUtil.PropCount, true);
218                         CSOut.WriteLine ("};");
219                         COut.WriteLine ("0};");
220
221                         COut.Close ();
222                 }
223
224                 private void DumpPropArray (int [] array, int count, bool getCP)
225                 {
226                         if (array.Length < count)
227                                 throw new ArgumentOutOfRangeException ("count");
228                         for (int i = 0; i < count; i++) {
229                                 uint value = (uint) array [i];
230                                 if (value < 10)
231                                         CSOut.Write ("{0}, ", value);
232                                 else
233                                         CSOut.Write ("0x{0:X}, ", value);
234                                 COut.Write ("{0},", value);
235                                 if (i % 16 == 15) {
236                                         int l = getCP ? NUtil.PropCP (i) : i;
237                                         CSOut.WriteLine ("// {0:X04}-{1:X04}", l - 15, l);
238                                         COut.WriteLine ();
239                                 }
240                         }
241                 }
242
243                 private void RebaseUCD ()
244                 {
245                         mappings.Sort (comparer);
246                         // mappedChars[0] = 0. This assures that value 0 of
247                         // mapIndex means there is no mapping.
248                         int count = 1;
249                         int [] compressedMapping = new int [mappedCharCount];
250                         // Update map index.
251                         int [] newMapIndex = new int [mappings.Count];
252                         for (int mi = 0; mi < mappings.Count; mi++) {
253                                 CharMapping m = (CharMapping) mappings [mi];
254                                 if (mi > 0 && 0 == comparer.Compare (
255                                         mappings [mi - 1], mappings [mi])) {
256                                         newMapIndex [mi] = newMapIndex [mi - 1];
257                                         continue;
258                                 }
259                                 newMapIndex [mi] = count;
260                                 for (int i = m.MapIndex; mappedChars [i] != 0; i++)
261                                         compressedMapping [count++] = mappedChars [i];
262                                 compressedMapping [count++] = 0;
263                         }
264                         for (int mi = 0; mi < mappings.Count; mi++)
265                                 ((CharMapping) mappings [mi]).MapIndex = newMapIndex [mi];
266
267                         int [] compressedMapIndex = new int [mapIndex.Length];
268                         foreach (CharMapping m in mappings)
269                                 if (m.CodePoint <= char.MaxValue)
270                                         compressedMapIndex [NUtil.MapIdx (m.CodePoint)] = m.MapIndex;
271
272                         mappedChars = compressedMapping;
273                         mapIndex = compressedMapIndex;
274                         mappedCharCount = count;
275                 }
276
277                 private void Parse ()
278                 {
279                         ParseNormalizationProps ();
280                         ParseUCD ();
281                 }
282                 
283                 private void ParseUCD ()
284                 {
285                         lineCount = 0;
286                         TextReader reader = new StreamReader ("downloaded/UnicodeData.txt");
287                         while (reader.Peek () != -1) {
288                                 string line = reader.ReadLine ();
289                                 lineCount++;
290                                 int idx = line.IndexOf ('#');
291                                 if (idx >= 0)
292                                         line = line.Substring (0, idx);
293                                 if (line.Length == 0)
294                                         continue;
295                                 int n = 0;
296                                 while (Char.IsDigit (line [n]) || Char.IsLetter (line [n]))
297                                         n++;
298                                 int cp = int.Parse (line.Substring (0, n), NumberStyles.HexNumber);
299                                 // Windows does not handle surrogate characters.
300                                 if (cp >= 0x10000)
301                                         continue;
302
303                                 string [] values = line.Substring (n + 1).Split (';');
304                                 string canon = values [4];
305                                 string combiningCategory = canon.IndexOf ('>') < 0 ? "" : canon.Substring (1, canon.IndexOf ('>') - 1);
306                                 string mappedCharsValue = canon;
307                                 if (combiningCategory.Length > 0)
308                                         mappedCharsValue = canon.Substring (combiningCategory.Length + 2).Trim ();
309                                 if (mappedCharsValue.Length > 0) {
310                                         int start = mappedCharCount;
311                                         mappings.Add (new CharMapping (cp,
312                                                 mappedCharCount, 
313                                                 combiningCategory.Length == 0));
314                                         SetCanonProp (cp, -1, mappedCharCount);
315                                         foreach (string v in mappedCharsValue.Split (' '))
316                                                 AddMappedChars (cp,
317                                                         int.Parse (v, NumberStyles.HexNumber));
318                                         AddMappedChars (cp, 0);
319                                         // For canonical composite, set IsUnsafe
320                                         if (combiningCategory == "") {
321                                                 for (int ca = start; ca < mappedCharCount - 1; ca++)
322                                                         FillUnsafe (mappedChars [ca]);
323                                         }
324                                 }
325                         }
326                         if (reader != Console.In)
327                                 reader.Close ();
328                 }
329
330                 private void FillUnsafe (int i)
331                 {
332                         if (i < 0 || i > char.MaxValue)
333                                 return;
334                         if (0x3400 <= i && i <= 0x9FBB)
335                                 return;
336                         SetProp (i, -1, IsUnsafe);
337                 }
338
339                 private void AddMappedChars (int cp, int cv)
340                 {
341                         if (mappedCharCount == mappedChars.Length) {
342                                 int [] tmp = new int [mappedCharCount * 2];
343                                 Array.Copy (mappedChars, tmp, mappedCharCount);
344                                 mappedChars = tmp;
345                         }
346                         mappedChars [mappedCharCount++] = cv;
347                 }
348
349                 private void SetCanonProp (int cp, int cpEnd, int flag)
350                 {
351                         int idx = NUtil.MapIdx (cp);
352                         if (cpEnd < 0)
353                                 mapIndex [idx] = flag;
354                         else {
355                                 int idxEnd = NUtil.MapIdx (cpEnd);
356                                 for (int i = idx; i <= idxEnd; i++)
357                                         mapIndex [i] = flag;
358                         }
359                 }
360
361                 private void ParseNormalizationProps ()
362                 {
363                         lineCount = 0;
364                         TextReader reader = new StreamReader ("downloaded/DerivedNormalizationProps.txt");
365                         while (reader.Peek () != -1) {
366                                 string line = reader.ReadLine ();
367                                 lineCount++;
368                                 int idx = line.IndexOf ('#');
369                                 if (idx >= 0)
370                                         line = line.Substring (0, idx);
371                                 if (line.Length == 0)
372                                         continue;
373                                 int n = 0;
374                                 while (Char.IsDigit (line [n]) || Char.IsLetter (line [n]))
375                                         n++;
376                                 int cp = int.Parse (line.Substring (0, n), NumberStyles.HexNumber);
377                                 // Windows does not handle surrogate characters.
378                                 if (cp >= 0x10000)
379                                         continue;
380
381                                 int cpEnd = -1;
382                                 if (line [n] == '.' && line [n + 1] == '.')
383                                         cpEnd = int.Parse (line.Substring (n + 2, n), NumberStyles.HexNumber);
384                                 int nameStart = line.IndexOf (';') + 1;
385                                 int valueStart = line.IndexOf (';', nameStart) + 1;
386                                 string name = valueStart == 0 ? line.Substring (nameStart) :
387                                         line.Substring (nameStart, valueStart - nameStart - 1);
388                                 name = name.Trim ();
389                                 string values = valueStart > 0 ?
390                                         line.Substring (valueStart).Trim () : "";
391                                 switch (name) {
392                                 case "Full_Composition_Exclusion":
393                                         SetProp (cp, cpEnd, FullCompositionExclusion);
394                                         break;
395                                 case "NFD_QC":
396                                         if (cp != 0xAC00) // Hangul Syllables are computed
397                                                 SetProp (cp, cpEnd, NoNfd);
398                                         break;
399                                 case "NFC_QC":
400                                         SetProp (cp, cpEnd, (values == "M") ?
401                                                 MaybeNfc :NoNfc);
402                                         break;
403                                 case "NFKD_QC":
404                                         if (cp != 0xAC00) // Hangul Syllables are computed
405                                                 SetProp (cp, cpEnd, NoNfkd);
406                                         break;
407                                 case "NFKC_QC":
408                                         SetProp (cp, cpEnd, (values == "M") ?
409                                                 MaybeNfkc :NoNfkc);
410                                         break;
411                                 /*
412                                 case "Expands_On_NFD":
413                                         if (cp != 0xAC00) // Hangul Syllables are computed
414                                                 SetProp (cp, cpEnd, ExpandOnNfd);
415                                         break;
416                                 case "Expands_On_NFC":
417                                         SetProp (cp, cpEnd, ExpandOnNfc);
418                                         break;
419                                 case "Expands_On_NFKD":
420                                         if (cp != 0xAC00) // Hangul Syllables are computed
421                                                 SetProp (cp, cpEnd, ExpandOnNfkd);
422                                         break;
423                                 case "Expands_On_NFKC":
424                                         SetProp (cp, cpEnd, ExpandOnNfkc);
425                                         break;
426                                 */
427                                 /*
428                                 case "FC_NFKC":
429                                         int v1 = 0, v2 = 0, v3 = 0, v4 = 0;
430                                         foreach (string s in values.Split (' ')) {
431                                                 if (s.Trim ().Length == 0)
432                                                         continue;
433                                                 int v = int.Parse (s, NumberStyles.HexNumber);
434                                                 if (v1 == 0)
435                                                         v1 = v;
436                                                 else if (v2 == 0)
437                                                         v2 = v;
438                                                 else if (v3 == 0)
439                                                         v3 = v;
440                                                 else if (v4 == 0)
441                                                         v4 = v;
442                                                 else
443                                                         throw new NotSupportedException (String.Format ("more than 4 values in FC_NFKC: {0:x}", cp));
444                                         }
445                                         SetNFKC (cp, cpEnd, v1, v2, v3, v4);
446                                         break;
447                                 */
448                                 }
449                         }
450                         reader.Close ();
451                 }
452
453                 private void SetProp (int cp, int cpEnd, int flag)
454                 {
455                         int idx = NUtil.PropIdx (cp);
456                         if (idx == 0)
457                                 throw new Exception (String.Format ("Codepoint {0:X04} should be included in the indexer.", cp));
458                         if (cpEnd < 0)
459                                 prop [idx] |= flag;
460                         else {
461                                 int idxEnd = NUtil.PropIdx (cpEnd);
462                                 for (int i = idx; i <= idxEnd; i++)
463                                         prop [i] |= flag;
464                         }
465                 }
466
467                 /*
468                 private void SetNFKC (int cp, int cpEnd, int v1, int v2, int v3, int v4)
469                 {
470                         if (v2 == 0) {
471                                 int idx = -1;
472                                 for (int i = 0; i < singleCount; i++)
473                                         if (singleNorm [i] == v1) {
474                                                 idx = i;
475                                                 break;
476                                         }
477                                 if (idx < 0) {
478                                         if (singleNorm.Length == singleCount) {
479                                                 int [] tmp = new int [singleCount << 1];
480                                                 Array.Copy (singleNorm, tmp, singleCount);
481                                                 singleNorm = tmp;
482                                                 idx = singleCount;
483                                         }
484                                         singleNorm [singleCount++] = v1;
485                                 }
486                                 SetProp (cp, cpEnd, idx << 16);
487                         } else {
488                                 if (multiNorm.Length == multiCount) {
489                                         int [] tmp = new int [multiCount << 1];
490                                         Array.Copy (multiNorm, tmp, multiCount);
491                                         multiNorm = tmp;
492                                 }
493                                 SetProp (cp, cpEnd,
494                                         (int) ((multiCount << 16) | 0xF0000000));
495                                 multiNorm [multiCount++] = v1;
496                                 multiNorm [multiCount++] = v2;
497                                 multiNorm [multiCount++] = v3;
498                                 multiNorm [multiCount++] = v4;
499                         }
500                 }
501                 */
502
503                 class CharMapping
504                 {
505                         public CharMapping (int cp, int mapIndex, bool isCanonical)
506                         {
507                                 MapIndex = mapIndex;
508                                 CodePoint = cp;
509                                 IsCanonical = isCanonical;
510                         }
511
512                         public int MapIndex;
513                         public readonly int CodePoint;
514                         public readonly bool IsCanonical;
515                 }
516
517                 class CharMappingComparer : IComparer
518                 {
519                         NormalizationCodeGenerator parent;
520
521                         public CharMappingComparer (NormalizationCodeGenerator g)
522                         {
523                                 parent = g;
524                         }
525
526                         // Note that this never considers IsCanonical
527                         public int Compare (object o1, object o2)
528                         {
529                                 CharMapping c1 = (CharMapping) o1;
530                                 CharMapping c2 = (CharMapping) o2;
531                                 return CompareArray (c1.MapIndex, c2.MapIndex);
532                         }
533
534                         // Note that this never considers IsCanonical
535                         public int CompareArray (int idx1, int idx2)
536                         {
537                                 for (int i = 0; ; i++) {
538                                         int l = parent.mappedChars [idx1 + i];
539                                         int r = parent.mappedChars [idx2 + i];
540                                         if (l != r)
541                                                 return l - r;
542                                         if (l == 0)
543                                                 return 0;
544                                 }
545                         }
546                 }
547
548                 private void ProcessCombiningClass ()
549                 {
550                         TextReader reader = new StreamReader ("downloaded/DerivedCombiningClass.txt");
551                         while (reader.Peek () != -1) {
552                                 string line = reader.ReadLine ();
553                                 lineCount++;
554                                 int idx = line.IndexOf ('#');
555                                 if (idx >= 0)
556                                         line = line.Substring (0, idx).Trim ();
557                                 if (line.Length == 0)
558                                         continue;
559                                 int n = 0;
560                                 while (Char.IsDigit (line [n]) || Char.IsLetter (line [n]))
561                                         n++;
562                                 int cp = int.Parse (line.Substring (0, n), NumberStyles.HexNumber);
563                                 // Windows does not handle surrogate characters.
564                                 if (cp >= 0x10000)
565                                         continue;
566
567                                 int cpEnd = -1;
568                                 if (line [n] == '.' && line [n + 1] == '.')
569                                         cpEnd = int.Parse (line.Substring (n + 2, n), NumberStyles.HexNumber);
570                                 int nameStart = line.IndexOf (';') + 1;
571                                 int valueStart = line.IndexOf (';', nameStart) + 1;
572                                 string val = valueStart == 0 ? line.Substring (nameStart) :
573                                         line.Substring (nameStart, valueStart - nameStart - 1);
574                                 SetCombiningProp (cp, cpEnd, short.Parse (val));
575                         }
576
577                         reader.Close ();
578
579                         byte [] ret = (byte []) CodePointIndexer.CompressArray (
580                                 combining, typeof (byte), NUtil.Combining);
581
582                         COut = new StreamWriter ("normalization-tables.h", true);
583
584                         COut.WriteLine ("static const guint8 combiningClass [] = {");
585                         CSOut.WriteLine ("public static byte [] combiningClassArr = new byte [] {");
586                         for (int i = 0; i < ret.Length; i++) {
587                                 byte value = ret [i];
588                                 if (value < 10)
589                                         CSOut.Write ("{0},", value);
590                                 else
591                                         CSOut.Write ("0x{0:X02},", value);
592                                 COut.Write ("{0},", value);
593                                 if (i % 16 == 15) {
594                                         CSOut.WriteLine (" // {0:X04}", NUtil.Combining.ToCodePoint (i - 15));
595                                         COut.WriteLine ();
596                                 }
597                         }
598                         CSOut.WriteLine ("};");
599                         COut.WriteLine ("0};");
600
601                         COut.Close ();
602                 }
603
604                 private void SetCombiningProp (int cp, int cpEnd, short val)
605                 {
606                         if (val == 0)
607                                 return;
608                         if (cpEnd < 0)
609                                 combining [cp] = (byte) val;
610                         else
611                                 for (int i = cp; i <= cpEnd; i++)
612                                         combining [i] = (byte) val;
613                 }
614         }
615 }
616