2 // create-category-table.cs - Generate Unicode category tables for the
6 // Damien Diederen (dd@crosstwine.com)
8 // Permission is hereby granted, free of charge, to any person obtaining
9 // a copy of this software and associated documentation files (the
10 // "Software"), to deal in the Software without restriction, including
11 // without limitation the rights to use, copy, modify, merge, publish,
12 // distribute, sublicense, and/or sell copies of the Software, and to
13 // permit persons to whom the Software is furnished to do so, subject to
14 // the following conditions:
16 // The above copyright notice and this permission notice shall be
17 // included in all copies or substantial portions of the Software.
19 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
20 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
22 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
23 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
24 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
25 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
28 // create-category-table.exe --dump <dump-file>
29 // create-category-table.exe --encode <dump-file> <data-name> <h-file>
32 // Dump, encode and generate (partially) bi-level category tables
33 // containing variants of the Unicode category database.
35 // With --dump <dump-file>, dump the contents of the hosting
36 // runtime's database to <dump-file> in an easily-parseable ASCII
39 // With --encode <dump-file> <data-name> <h-file>, load a previously
40 // generated dump and create the corresponding header file
41 // containing two static C arrays: '<data-name>' and
42 // '<data-name>_astral_index'.
44 // The main table is linear for code points in the U+0000..U+FFFF
45 // range; the 'astral_index' portion is necessary to select pages
46 // related to code points in the astral planes:
48 // data [(astral_index [(cp - 0x10000) >> 8] << 8) + (cp & 0xff)]
51 using System.Diagnostics;
53 using System.Globalization;
55 // Avoiding System.Collections.Generic means this program can be compiled and run on v1.1
56 // after updating MaxCodePoint and removing Char.ConvertFromUtf32
58 using System.Collections;
// Namespace for the table generation code in this file.
60 namespace Mono.Globalization.Unicode
// Encoder that cuts a data source into pages of page_size values (Process)
// and writes the collected pages and index tables out as source text
// (WriteDefinitions).
68 public class PagedTableEncoder
// Data source consumed by Process, which reads individual values through
// this indexer and the overall length through a Count member.
70 public interface IData
// Value at position index, as a ushort.
72 ushort this [int index]
// IData implementation backed by a System.Array; ParseDump wraps its
// byte [] result in one of these.
83 public class ArrayData : PagedTableEncoder.IData
// Takes the array to expose; presumably stored for use by the indexer below.
85 public ArrayData (Array data)
// Returns the element at index converted to ushort.
90 public ushort this [int index]
// Array.GetValue returns the element as object; the cast fails with
// InvalidCastException if the element type does not implement IConvertible.
93 IConvertible value = (IConvertible)data.GetValue (index);
// The conversion is given a null format provider.
95 return value.ToUInt16 (null);
// Creates a page. GetPageForData passes the base offset of the chunk being
// stored, the current length of the page list as the page number, and the
// chunk values.
111 public Page (int first_base, int number, ushort [] data)
113 this.first_base = first_base;
114 this.number = number;
// Records one more index entry that points at this page. Process calls this
// for the entries it creates.
118 public void AddIndexEntry (IndexEntry index_entry)
120 index_entries.Add (index_entry);
// Compares the given values against the contents of this page, element by
// element. Only data.Length positions are examined and this.data is indexed
// without a length check, so the argument must not be longer than the page;
// the call chain in this file always passes arrays of page_size elements.
123 public bool Contains (ushort [] data)
125 for (int i = 0; i < data.Length; i++)
// A differing element means the contents do not match.
126 if (this.data [i] != data [i])
// Base offset in the source data of the chunk this page was created from.
// MaybeWriteIfndef compares it against Char.MaxValue + 1.
131 public readonly int first_base;
// Position of the page in the encoder page list at creation time;
// WriteIndexTable emits it as the index value.
133 public readonly int number;
// The page values.
135 public readonly ushort [] data;
// Index entries that refer to this page (filled by AddIndexEntry).
137 public IList index_entries = new ArrayList (2);
// Describes one range of the source data together with the page that holds
// its values. Process creates entries with end = start + page_size and the
// writers print end - 1 as the last position, so end is exclusive.
142 public IndexEntry (string key, int start, int end, Page page)
// Name of the index the entry belongs to. CollapseByIndex builds entries
// whose key is a compound of several names (see CompoundKey).
150 public readonly string key;
// First position covered by the entry.
152 public readonly int start;
// One past the last position covered by the entry.
154 public readonly int end;
// Page holding the values for this range.
156 public readonly Page page;
// Orders IndexEntry objects by ascending start offset.
159 class IndexEntriesComparer : IComparer
// Both arguments are cast to IndexEntry; any other type raises
// InvalidCastException.
161 public int Compare (object x, object y)
// The sign of the difference gives the ordering.
// NOTE(review): plain subtraction can overflow when the operands have
// opposite signs. The start values produced by Process are non-negative
// page bases, so that does not happen for entries created in this file.
163 return ((IndexEntry) x).start - ((IndexEntry) y).start;
// A named index table: the list of entries Process produced for one data
// source.
169 public Index (string name, IList entries)
172 this.entries = entries;
// WriteDefinitions appends this name, after an underscore, to the data table
// name to form the index table name.
175 public readonly string name;
// IndexEntry objects in the order Process added them.
177 public readonly IList entries;
// Configures the encoder.
//   page_bits        - log2 of the page size; pages hold 1 << page_bits values.
//   value_bits       - width of a data table element; asserted to be 8 or 16.
//   index_bits       - width of an index table element; asserted to be 16 or 32.
//   flat_bmp         - see IsIndexed: when set, only pages whose base lies
//                      above Char.MaxValue count as indexed.
//   no_astral_symbol - preprocessor symbol used in the emitted #ifndef
//                      guards; may be null (MaybeWriteIfndef and
//                      WriteIndexTable both test for null).
// The range checks use Debug.Assert, which is only compiled in when DEBUG is
// defined, so release builds accept any values.
180 public PagedTableEncoder (int page_bits,
184 string no_astral_symbol)
// NOTE(review): presumably at least 4 because WriteDataTable emits 16 values
// per row - confirm.
186 Debug.Assert (page_bits >= 4);
187 Debug.Assert (value_bits == 8 || value_bits == 16);
188 Debug.Assert (index_bits == 16 || index_bits == 32);
// Only the derived page size is kept, not page_bits itself.
190 this.page_size = 1 << page_bits;
191 this.value_bits = value_bits;
192 this.index_bits = index_bits;
193 this.flat_bmp = flat_bmp;
194 this.no_astral_symbol = no_astral_symbol;
// Cuts data into consecutive chunks of page_size values, obtains a Page for
// each chunk through GetPageForData, and registers a new Index called
// index_name holding the IndexEntry objects created along the way.
197 public void Process (IData data, string index_name)
199 int end = data.Count;
// With flat_bmp set, no pages may exist yet when processing starts.
201 Debug.Assert (!flat_bmp || pages.Count == 0);
// page_size is a power of two, so this checks that the element count is a
// whole number of pages.
202 Debug.Assert ((end & (page_size - 1)) == 0);
204 IList entries = new ArrayList ();
206 for (int page_base = 0; page_base < end; page_base += page_size) {
207 ushort [] page_data = new ushort [page_size];
// Read the chunk values from the source.
209 for (int i = 0; i < page_size; i++) {
210 ushort v = data [page_base + i];
215 bool indexed = IsIndexed (page_base);
216 Page page = GetPageForData (page_base, page_data, indexed);
// The entry covers [page_base, page_base + page_size) and is recorded both
// on the page and in the list for this index.
219 IndexEntry index_entry = new IndexEntry (index_name, page_base,
220 page_base + page_size, page);
221 page.AddIndexEntry (index_entry);
222 entries.Add (index_entry);
226 indices.Add (new Index (index_name, entries));
// Returns the Page that stores data. The existing pages are searched
// linearly for one with equal contents (Page.Contains); when a new page is
// needed it is numbered with the current page count and appended to pages.
// NOTE(review): how the indexed flag influences the search is decided on
// lines not shown in this view - confirm before relying on it.
229 Page GetPageForData (int page_base, ushort [] data, bool indexed)
232 // Are we in a hurry?
// Linear scan: each lookup costs up to pages.Count comparisons.
233 foreach (Page page in pages) {
234 if (page.Contains (data))
239 Page new_page = new Page (page_base, pages.Count, data);
240 pages.Add (new_page);
// Tells whether the chunk starting at page_base counts as indexed: always
// when flat_bmp is off, otherwise only when page_base lies above
// Char.MaxValue (0xFFFF).
244 bool IsIndexed (int page_base)
246 return !flat_bmp || page_base > Char.MaxValue;
// Writes everything collected so far to w: the size summary comment, the
// data table called name, and one index table per processed index.
249 public void WriteDefinitions (Language lang, string name, TextWriter w)
251 WriteHeaderComment (w);
252 WriteDataTable (lang, name, w);
254 foreach (Index index in indices) {
// Index tables are named after the data table, followed by an underscore and
// the index name.
256 WriteIndexTable (lang, index, name + '_' + index.name, w);
// Writes the size summary lines of the generated comment: value width and
// page size, data table size, the size of each index, and the total.
260 void WriteHeaderComment (TextWriter w)
// Data table size in bytes: every page slot takes value_bits / 8 bytes.
262 int packed_size = pages.Count * page_size * value_bits / 8;
263 int total_size = packed_size;
266 w.WriteLine (" * Value bits: {0}, Page size: {1}", value_bits, page_size);
267 w.WriteLine (" * Packed table: {0} bytes", packed_size);
269 foreach (Index index in indices) {
// NOTE(review): 2 bytes per entry matches index_bits == 16 only. The
// constructor also accepts index_bits == 32, in which case this figure
// would under-report; confirm whether index_bits / 8 was intended.
270 int index_size = index.entries.Count * 2;
272 w.WriteLine (" * Index {0}: {1} bytes", index.name, index_size);
274 total_size += index_size;
277 w.WriteLine (" * Total: {0} bytes", total_size);
// Builds a single display key from a list of key strings: the keys are
// sorted and joined with a comma followed by a space.
281 public string CompoundKey (ArrayList keys)
// Every element must be a string, otherwise ToArray throws.
283 string [] key_array = (string []) keys.ToArray (typeof (string));
// Sorting makes the result independent of the order the keys were added in.
284 Array.Sort (key_array);
285 return string.Join (", ", key_array);
// Groups index entries that cover the same range. The input is copied and
// sorted by start offset; entries are then walked in order and each new
// IndexEntry added to the result carries a compound key (see CompoundKey)
// built from the keys gathered for its range. An empty input is returned
// unchanged.
288 public IList CollapseByIndex (IList index_entries)
290 if (index_entries.Count == 0)
291 return index_entries;
// Work on a copy so the list passed in is not reordered.
293 ArrayList entries = new ArrayList (index_entries);
// NOTE(review): the .NET documentation describes ArrayList.Sort as an
// unstable sort, so entries with equal start are not guaranteed to keep
// their relative order; the statement below may be inaccurate - confirm.
294 // The comparer is required for a stable sort.
295 entries.Sort (new IndexEntriesComparer ());
// Seed the current group from the first sorted entry.
297 IndexEntry first = (IndexEntry) entries [0];
298 ArrayList keys = new ArrayList ();
299 keys.Add (first.key);
300 int start = first.start;
302 Page page = first.page;
303 IList collapsed = new ArrayList ();
305 for (int i = 1; i < entries.Count; i ++) {
306 IndexEntry ie = (IndexEntry) entries [i];
// Same range as the current group.
308 if (ie.start == start && ie.end == end)
// Emit an entry for the gathered keys, then start a fresh key list.
311 collapsed.Add (new IndexEntry (CompoundKey (keys), start, end, page));
313 keys = new ArrayList ();
// Emit the last group.
321 collapsed.Add (new IndexEntry (CompoundKey (keys), start, end, page));
// Merges runs of entries: an entry that starts where the current run ends
// and has the same key is treated as a continuation of that run. The result
// holds one IndexEntry per run. The input is expected to be ordered by
// start already (CollapseIndexEntries feeds it the output of
// CollapseByIndex).
325 public IList CollapseByRange (IList entries)
327 if (entries.Count == 0)
// Seed the current run from the first entry.
330 IndexEntry first = (IndexEntry) entries [0];
331 string key = first.key;
332 int start = first.start;
334 Page page = first.page;
335 IList collapsed = new ArrayList ();
337 for (int i = 1; i < entries.Count; i++) {
338 IndexEntry ie = (IndexEntry) entries [i];
// Adjacent and same key: the entry continues the current run. The string
// comparison is by value.
340 if (ie.start == end && ie.key == key) {
// Emit an entry for the current run.
343 collapsed.Add (new IndexEntry (key, start, end, page));
// Emit the last run.
351 collapsed.Add (new IndexEntry (key, start, end, page));
// Applies both reductions in sequence: CollapseByIndex first, then
// CollapseByRange on its result.
355 public IList CollapseIndexEntries (IList index_entries)
357 return CollapseByRange (CollapseByIndex (index_entries));
// Writes the data table declaration and contents to w: a declaration line
// for the requested language, then for every page a descriptive comment
// followed by the page values, 16 per row.
360 void WriteDataTable (Language lang, string name, TextWriter w)
// Total number of elements: every stored page contributes page_size values.
362 int n_entries = pages.Count * page_size;
// C declaration: element type is guint8 or guint16 depending on value_bits.
364 if (lang == Language.C)
365 w.WriteLine ("static const guint{0} {1} [{2}] = ", value_bits, name, n_entries);
// C# declaration: byte for 8-bit values, ushort otherwise.
367 string type = value_bits == 8 ? "byte" : "ushort";
369 w.WriteLine ("static readonly {0} [] {1} = new {0} [{2}] ", type, name, n_entries);
// The separator is written before each item: the opening brace first, then
// a comma and newline between items.
372 string separator = TABLE_START;
// Set once MaybeWriteIfndef has reported writing an #ifndef line.
373 bool has_ifndef = false;
374 foreach (Page page in pages) {
375 has_ifndef |= MaybeWriteIfndef (page, ref separator, w);
376 WritePageComment (page, ref separator, w);
// Rows of 16 values; page sizes below 16 would index past the page data.
378 for (int i = 0; i < page_size; i += 16) {
379 w.Write("{0}\t{1},{2},{3},{4},{5},{6},{7},{8}," +
380 "{9},{10},{11},{12},{13},{14},{15},{16}",
382 page.data[i + 0], page.data[i + 1],
383 page.data[i + 2], page.data[i + 3],
384 page.data[i + 4], page.data[i + 5],
385 page.data[i + 6], page.data[i + 7],
386 page.data[i + 8], page.data[i + 9],
387 page.data[i + 10], page.data[i + 11],
388 page.data[i + 12], page.data[i + 13],
389 page.data[i + 14], page.data[i + 15]);
391 separator = TABLE_CONT;
395 // Separator intentionally ignored.
// Closes the preprocessor guard on its own line; presumably only when
// has_ifndef is set - the guarding condition is not part of this view.
397 w.Write ("{0}#endif", Environment.NewLine);
398 w.WriteLine (TABLE_END);
// Writes an #ifndef line for no_astral_symbol in front of the page whose
// first_base is Char.MaxValue + 1 (0x10000). Callers OR the returned flag
// into a local that tracks whether a guard was opened.
401 bool MaybeWriteIfndef (Page page, ref string separator, TextWriter w)
// Nothing to write when no symbol is configured or for any other page.
403 if (no_astral_symbol == null || page.first_base != Char.MaxValue + 1)
// The directive is put on a line of its own.
406 w.WriteLine ("{0}#ifndef {1}", Environment.NewLine, no_astral_symbol);
407 // Previous separator, but indented on the new line following the directive.
408 separator = "\t" + separator;
// Writes the comment that introduces a page in the data table: the page
// number, how many index entries use it, and the ranges those entries cover.
// separator is written in front of the comment; in the multi-line form the
// ref argument is reassigned while the ranges are listed.
412 void WritePageComment (Page page, ref string separator, TextWriter w)
// Use count is taken before collapsing, so it counts raw index entries.
414 int uses = page.index_entries.Count;
415 IList index_entries = CollapseIndexEntries (page.index_entries);
// Short form: no uses at all, or all uses collapse into a single range.
417 if (uses == 0 || index_entries.Count == 1) {
418 w.Write ("{0}\t/* Page {1}, {2} {3}use{4}",
419 separator, page.number, uses,
420 flat_bmp ? "indirect " : "",
421 uses != 1 ? "s" : "");
423 if (index_entries.Count == 1) {
424 IndexEntry ie = (IndexEntry) index_entries [0];
// end is exclusive, hence end - 1 for the last position printed.
426 w.WriteLine (": {0:X4}-{1:X4} ({2}) */", ie.start, ie.end - 1, ie.key);
// Long form: a block comment with one line per collapsed range.
430 w.Write ("{0}\t/*{1}\t * Page {2}, {3} indirect use{4}",
431 separator, Environment.NewLine, page.number, uses, uses != 1 ? "s" : "");
// The first range follows a colon, later ranges follow a comma.
433 separator = ":" + Environment.NewLine + "\t *\t";
434 string next_separator = "," + Environment.NewLine + "\t *\t";
436 foreach (IndexEntry ie in index_entries) {
437 w.Write ("{0}{1:X4}-{2:X4} ({3})", separator, ie.start, ie.end - 1, ie.key);
438 separator = next_separator;
441 // Separator intentionally ignored.
442 w.WriteLine (Environment.NewLine + "\t */");
// Writes one index table to w: a declaration for the requested language and,
// for every entry of the index, a comment naming the covered range followed
// by the number of the page that holds it.
448 void WriteIndexTable (Language lang, Index index, string name, TextWriter w)
// With a flat BMP and a configured symbol the guard relates to the table as
// a whole rather than to individual pages.
450 bool ifndef_around = flat_bmp && no_astral_symbol != null;
452 w.WriteLine ("#ifndef {0}", no_astral_symbol);
// C declaration: element type follows index_bits.
454 if (lang == Language.C)
455 w.WriteLine ("static const guint{0} {1} [{2}] = ", index_bits, name, index.entries.Count);
// NOTE(review): the C# element type is derived from value_bits, whereas the
// C branch above uses index_bits. With value_bits 8 and index_bits 16 (the
// configuration used by Encode) this selects uint - confirm whether
// index_bits was intended here.
457 string type = value_bits == 16 ? "ushort" : "uint";
459 w.WriteLine ("static readonly {0} [] {1} = new {0} [{2}] ", type, name, index.entries.Count);
462 string separator = TABLE_START;
// Set once MaybeWriteIfndef has reported writing an #ifndef line.
463 bool ifndef_inside = false;
464 foreach (IndexEntry ie in index.entries) {
// The page number is stored as is; scaling by page_size is commented out.
465 int index_value = ie.page.number /* * page_size */;
// NOTE(review): for index_bits == 32 the int expression 1 << index_bits
// evaluates to 1, because C# uses only the low five bits of the shift
// count, so the check would fail for any page number above 0. A long
// shift (1L << index_bits) would avoid that - confirm.
467 Debug.Assert (index_value < (1 << index_bits));
470 ifndef_inside |= MaybeWriteIfndef (ie.page, ref separator, w);
// end is exclusive, hence end - 1 for the last position printed.
472 w.WriteLine ("{0}\t/* {1:X4}-{2:X4}: page {3} */",
473 separator, ie.start, ie.end - 1, ie.page.number);
474 w.Write ("\t0x{0:X}", index_value);
476 separator = TABLE_CONT;
479 // Separator intentionally ignored.
481 w.Write ("{0}#endif", Environment.NewLine);
482 w.WriteLine (TABLE_END);
485 w.WriteLine ("#endif");
// Number of values per page (1 << page_bits).
488 readonly int page_size;
// Width in bits of a data table element.
490 readonly int value_bits;
// Width in bits of an index table element.
492 readonly int index_bits;
// When set, only chunks based above Char.MaxValue count as indexed.
494 readonly bool flat_bmp;
// Preprocessor symbol for the emitted #ifndef guards; null disables them.
496 readonly string no_astral_symbol;
// Page objects in creation order; the position equals Page.number.
498 IList pages = new ArrayList ();
// Index objects, one per Process call.
500 IList indices = new ArrayList ();
// Text pieces used while writing a table: opening brace, item separator and
// closing brace with semicolon.
502 static readonly string TABLE_START = "{" + Environment.NewLine;
503 static readonly string TABLE_CONT = "," + Environment.NewLine;
504 static readonly string TABLE_END = Environment.NewLine + "};";
// Command-line driver: dumps the category data of the hosting runtime and
// encodes a dump into a header file (see Main).
507 class CategoryTableGenerator {
// Highest code point handled by the tool.
508 const int MaxCodePoint = 0x10ffff;
// IData implementation that answers from the Unicode category data of the
// runtime executing this program.
510 public class HostUCData : PagedTableEncoder.IData
// Returns the UnicodeCategory of code point index, cast to ushort.
512 public ushort this [int index]
// Single UTF-16 unit path.
// NOTE(review): the condition choosing between the two paths is not part of
// this view; presumably values that fit in a char take this one - confirm.
516 return (ushort) Char.GetUnicodeCategory ((char) index);
// Other code points are converted to a string first and looked up at
// position 0 of that string.
518 string s = Char.ConvertFromUtf32 (index);
520 return (ushort) Char.GetUnicodeCategory (s, 0);
// One value per code point from 0 to MaxCodePoint inclusive.
528 return MaxCodePoint + 1;
// Writes source to w as text: the element count on the first line, then
// lines holding a code point and its category value separated by a space,
// both in decimal.
533 public static void Dump (PagedTableEncoder.IData source, TextWriter w)
535 w.WriteLine ("{0}", source.Count);
// The loop bound is MaxCodePoint, not source.Count, so the source must
// provide at least MaxCodePoint + 1 values.
537 for (int cp = 0; cp <= MaxCodePoint; cp++) {
// Only the low 8 bits of the value are kept.
538 byte cc = (byte) source [cp];
541 w.WriteLine ("{0} {1}", cp, cc);
// Reads the text form written by Dump from r and returns it as an IData.
// The first line gives the number of elements; each following line holds a
// code point and a category code separated by a space. Invalid or duplicate
// entries are reported by throwing Exception.
545 public static PagedTableEncoder.IData ParseDump (TextReader r)
// NOTE(review): int.Parse without an explicit culture uses the current
// culture; the dump is machine-generated decimal text - confirm whether the
// invariant culture should be used.
547 string line = r.ReadLine ();
548 int count = int.Parse (line);
549 byte [] data = new byte [count];
551 while ((line = r.ReadLine ()) != null) {
// Split at the first space. NOTE(review): IndexOf returns -1 when a line
// has no space, which makes Substring throw ArgumentOutOfRangeException;
// no check for that is visible here.
552 int n = line.IndexOf (' ');
553 int cp = int.Parse (line.Substring (0, n));
554 int cc = int.Parse (line.Substring (n + 1));
// The code point must be a valid position in the array sized by the first
// line.
556 if (cp < 0 || cp >= data.Length)
557 throw new Exception (String.Format ("Invalid code point {0:X4}", cp));
// The category must lie between 0 and UnicodeCategory.OtherNotAssigned.
559 if (cc < 0 || cc > (int)UnicodeCategory.OtherNotAssigned)
560 throw new Exception (String.Format ("Invalid category code {0}", cc));
563 throw new Exception (String.Format ("Duplicate code point {0:X4}", cp));
565 data [cp] = (byte)cc;
568 return new PagedTableEncoder.ArrayData (data);
// Loads the dump stored in dump_file, encodes it, and writes the resulting
// C definitions to h_file using data_name as the table name.
571 public static void Encode (string dump_file, string data_name, string h_file)
573 PagedTableEncoder.IData data;
// The reader is disposed as soon as parsing finishes.
575 using (TextReader r = new StreamReader (dump_file))
576 data = ParseDump (r);
// 8 page bits (256 values per page), 8-bit values, 16-bit index entries,
// flat BMP, and DISABLE_ASTRAL as the guard symbol.
578 PagedTableEncoder pte = new PagedTableEncoder (8, 8, 16, true, "DISABLE_ASTRAL");
579 pte.Process (data, "astral_index");
581 using (TextWriter w = new StreamWriter (h_file)) {
// Banner telling readers that the file is generated.
583 w.WriteLine (" * The {0}* tables below are automatically generated", data_name);
584 w.WriteLine (" * by create-category-table(.cs), available in the mcs");
585 w.WriteLine (" * sources. DO NOT EDIT!");
589 pte.WriteDefinitions (Language.C, data_name, w);
// Entry point. Handles --dump with one file argument and --encode with three
// arguments (dump file, data name, header file); any other argument raises
// an Exception.
593 public static void Main (string [] args)
// The for header has no increment, so i has to be advanced inside the body.
595 for (int i = 0; i < args.Length; ) {
596 if (args [i] == "--dump") {
597 PagedTableEncoder.IData data = new HostUCData ();
// NOTE(review): args [i + 1] is read without a visible length check; a
// missing file name would raise IndexOutOfRangeException.
599 using (TextWriter w = new StreamWriter (args [i + 1]))
603 } else if (args [i] == "--encode") {
// The same applies to the three arguments read here.
604 Encode (args [i + 1], args [i + 2], args [i + 3]);
608 throw new Exception ("Unrecognized argument: " + args [i]);