5 // Atsushi Enomoto <atsushi@ximian.com>
7 // Copyright (C) 2008 Novell, Inc.
11 // Unicode table generator for eglib.
12 // Note that this code is only for Unicode 5.1.0 or earlier.
13 // (regarding character ranges)
// - lower-band (0000-FFFF) characters never have case mappings to higher-band
17 // characters. Hence, simple upper/lower mapping is divided into 16-bit and
22 using System.Collections.Generic;
23 using System.Globalization;
25 using System.Reflection;
27 namespace Mono.Globalization.Unicode
31 public static void Main (string [] args)
33 TextWriter w = Console.Out;
37 DO NOT MODIFY THIS FILE DIRECTLY.
39 This file is automatically generated by {0}.exe.
40 The source for this generator should be in Mono repository
41 (mcs/class/corlib/Mono.Globalization.Unicode directory).
44 #ifndef __UNICODE_DATA_H
45 #define __UNICODE_DATA_H
49 ", Assembly.GetEntryAssembly ().GetName ().Name);
50 var ud = new UnicodeData5_1_0 ();
51 var ucd = ud.ParseFile (args [0]);
52 var ucg = new UnicodeDataCodeGeneratorC5_1_0 (ud, w);
53 ucg.GenerateStructures ();
55 ucg.GenerateUnicodeCategoryListC (ucd);
57 ucg.GenerateSimpleCaseMappingListC (ucd);
59 ucg.GenerateSimpleTitlecaseMappingListC (ucd);
// Range catalog for Unicode 5.1.0: supplies the code point spans for which
// the generator emits simple case mapping tables and category tables.
public class UnicodeData5_1_0 : UnicodeData
{
	// Spans handed to the simple case mapping generator.
	static readonly CodePointRange [] simple_cases = {
		new CodePointRange (0x0040, 0x0600),
		new CodePointRange (0x1000, 0x10D0),
		new CodePointRange (0x1D00, 0x2000),
		new CodePointRange (0x2100, 0x21C0),
		new CodePointRange (0x2480, 0x2500),
		new CodePointRange (0x2C00, 0x2D80),
		new CodePointRange (0xA640, 0xA7C0),
		new CodePointRange (0xFF20, 0xFF80),
		new CodePointRange (0x10400, 0x10480),
	};

	// Spans handed to the category table generator. The commented lines
	// record the blocks deliberately left out of the tables, together
	// with the single category noted for each of them.
	static readonly CodePointRange [] category_ranges = {
		new CodePointRange (0x0000, 0x3400),
		// 3400-4DB5: OtherLetter
		new CodePointRange (0x4DC0, 0x4E00),
		// 4E00-9FC3: OtherLetter
		new CodePointRange (0xA000, 0xAA80),
		// AC00-D7A3: OtherLetter
		// D800-DFFF: OtherSurrogate
		// E000-F8FF: OtherPrivateUse
		new CodePointRange (0xF900, 0x10000),
		new CodePointRange (0x10000, 0x104C0),
		new CodePointRange (0x10800, 0x10A80),
		new CodePointRange (0x12000, 0x12480),
		new CodePointRange (0x1D000, 0x1D800),
		new CodePointRange (0x1F000, 0x1F0C0),
		// 20000-2A6D6 OtherLetter
		new CodePointRange (0x2F800, 0x2FA40),
		new CodePointRange (0xE0000, 0xE0200),
		// F0000-FFFFD OtherPrivateUse
		// 100000-10FFFD OtherPrivateUse
	};

	// Returns the shared simple-case span table (not a copy).
	public override CodePointRange [] SimpleCases {
		get { return simple_cases; }
	}

	// Returns the shared category span table (not a copy).
	public override CodePointRange [] CategoryRanges {
		get { return category_ranges; }
	}
}
111 public abstract class UnicodeData
113 public abstract CodePointRange [] SimpleCases { get; }
115 public abstract CodePointRange [] CategoryRanges { get; }
117 public virtual UcdCharacterProperty [] ParseFile (string file)
119 var d = new List<KeyValuePair<int,UcdCharacterProperty>> ();
121 using (TextReader r = File.OpenText (file)) {
122 while (r.Peek () >= 0) {
123 var l = r.ReadLine ();
124 if (l.Length > 0 && l [0] != '#') {
126 d.Add (new KeyValuePair<int,UcdCharacterProperty> (u.Codepoint, u));
130 var list = new List<UcdCharacterProperty> ();
133 return list.ToArray ();
// Converts one ';'-separated record (fields 0 through 14 are read) into a
// UcdCharacterProperty. Nothing is guarded: a record with too few fields
// or a malformed number surfaces as the exception raised by the indexer
// or by the Parse call that hit it.
UcdCharacterProperty Parse (string line)
{
	string [] fields = line.Split (';');

	// Field 5 is the decomposition: an optional "<tag>" followed by
	// space-separated hexadecimal code points.
	string [] mapping = null;
	string tag = null;
	if (fields [5].Length > 0) {
		mapping = fields [5].Split (' ');
		if (mapping [0] [0] == '<') {
			// Peel the tag off so that only code points remain.
			tag = mapping [0];
			string [] rest = new string [mapping.Length - 1];
			Array.Copy (mapping, 1, rest, 0, rest.Length);
			mapping = rest;
		}
	}

	var prop = new UcdCharacterProperty ();
	prop.Codepoint = int.Parse (fields [0], NumberStyles.HexNumber);
	prop.Name = fields [1];
	prop.Category = ParseUnicodeCategory (fields [2]);

	if (fields [3].Length > 0)
		prop.CanonicalCombiningClass = byte.Parse (fields [3]);
	else
		prop.CanonicalCombiningClass = null;

	if (fields [4].Length > 0)
		prop.BidiClass = (UcdBidiClass) Enum.Parse (typeof (UcdBidiClass), fields [4]);
	else
		prop.BidiClass = UcdBidiClass.None;

	if (tag != null)
		prop.DecompositionType = ParseDecompositionType (tag);
	else
		prop.DecompositionType = UcdDecompositionType.None;

	if (mapping != null) {
		int [] points = new int [mapping.Length];
		for (int i = 0; i < points.Length; i++)
			points [i] = int.Parse (mapping [i], NumberStyles.HexNumber);
		prop.DecompositionMapping = points;
	} else
		prop.DecompositionMapping = null;

	// The three numeric-value fields are kept as raw text.
	prop.DecimalDigitValue = fields [6];
	prop.DigitValue = fields [7];
	prop.NumericValue = fields [8];
	prop.BidiMirrored = (fields [9] == "Y");
	prop.Unicode1Name = fields [10];
	prop.IsoComment = fields [11];

	// An empty case mapping field is stored as 0.
	prop.SimpleUppercaseMapping = ParseOptionalHex (fields [12]);
	prop.SimpleLowercaseMapping = ParseOptionalHex (fields [13]);
	prop.SimpleTitlecaseMapping = ParseOptionalHex (fields [14]);
	return prop;
}

// Returns the hexadecimal value of s, or 0 when s is empty.
static int ParseOptionalHex (string s)
{
	return s.Length > 0 ? int.Parse (s, NumberStyles.HexNumber) : 0;
}
// Decomposition tags as they appear (angle brackets included) at the start
// of the decomposition field, index-aligned with decomposition_types.
static readonly string [] decomposition_tags = {
	"<font>", "<noBreak>", "<initial>", "<medial>",
	"<final>", "<isolated>", "<circle>", "<super>",
	"<sub>", "<vertical>", "<wide>", "<narrow>",
	"<small>", "<square>", "<fraction>", "<compat>",
};

static readonly UcdDecompositionType [] decomposition_types = {
	UcdDecompositionType.Font, UcdDecompositionType.NoBreak,
	UcdDecompositionType.Initial, UcdDecompositionType.Medial,
	UcdDecompositionType.Final, UcdDecompositionType.Isolated,
	UcdDecompositionType.Circle, UcdDecompositionType.Super,
	UcdDecompositionType.Sub, UcdDecompositionType.Vertical,
	UcdDecompositionType.Wide, UcdDecompositionType.Narrow,
	UcdDecompositionType.Small, UcdDecompositionType.Square,
	UcdDecompositionType.Fraction, UcdDecompositionType.Compat,
};

// Maps a decomposition tag to its UcdDecompositionType. The match is exact
// and case-sensitive; anything else (null included) is rejected with an
// ArgumentException.
UcdDecompositionType ParseDecompositionType (string s)
{
	int slot = Array.IndexOf (decomposition_tags, s);
	if (slot < 0)
		throw new ArgumentException (String.Format ("Unexpected decomposition type '{0}'", s));
	return decomposition_types [slot];
}
// Maps a two-letter general category abbreviation ("Lu", "Nd", ...) to the
// matching System.Globalization.UnicodeCategory. The match is exact and
// case-sensitive; any other input (null included) is rejected with an
// ArgumentException.
UnicodeCategory ParseUnicodeCategory (string s)
{
	if (s != null && s.Length == 2) {
		// First letter selects the major class, second the subclass.
		char minor = s [1];
		switch (s [0]) {
		case 'L':
			switch (minor) {
			case 'u': return UnicodeCategory.UppercaseLetter;
			case 'l': return UnicodeCategory.LowercaseLetter;
			case 't': return UnicodeCategory.TitlecaseLetter;
			case 'm': return UnicodeCategory.ModifierLetter;
			case 'o': return UnicodeCategory.OtherLetter;
			}
			break;
		case 'M':
			switch (minor) {
			case 'n': return UnicodeCategory.NonSpacingMark;
			case 'c': return UnicodeCategory.SpacingCombiningMark;
			case 'e': return UnicodeCategory.EnclosingMark;
			}
			break;
		case 'N':
			switch (minor) {
			case 'd': return UnicodeCategory.DecimalDigitNumber;
			case 'l': return UnicodeCategory.LetterNumber;
			case 'o': return UnicodeCategory.OtherNumber;
			}
			break;
		case 'P':
			switch (minor) {
			case 'c': return UnicodeCategory.ConnectorPunctuation;
			case 'd': return UnicodeCategory.DashPunctuation;
			case 's': return UnicodeCategory.OpenPunctuation;
			case 'e': return UnicodeCategory.ClosePunctuation;
			case 'i': return UnicodeCategory.InitialQuotePunctuation;
			case 'f': return UnicodeCategory.FinalQuotePunctuation;
			case 'o': return UnicodeCategory.OtherPunctuation;
			}
			break;
		case 'S':
			switch (minor) {
			case 'm': return UnicodeCategory.MathSymbol;
			case 'c': return UnicodeCategory.CurrencySymbol;
			case 'k': return UnicodeCategory.ModifierSymbol;
			case 'o': return UnicodeCategory.OtherSymbol;
			}
			break;
		case 'Z':
			switch (minor) {
			case 's': return UnicodeCategory.SpaceSeparator;
			case 'l': return UnicodeCategory.LineSeparator;
			case 'p': return UnicodeCategory.ParagraphSeparator;
			}
			break;
		case 'C':
			switch (minor) {
			case 'c': return UnicodeCategory.Control;
			case 'f': return UnicodeCategory.Format;
			case 's': return UnicodeCategory.Surrogate;
			case 'o': return UnicodeCategory.PrivateUse;
			case 'n': return UnicodeCategory.OtherNotAssigned;
			}
			break;
		}
	}
	throw new ArgumentException (String.Format ("Unexpected category {0}", s));
}
274 public class UnicodeDataCodeGeneratorC5_1_0
// Binds the generator to the range catalog it reads from and to the
// writer that every Generate* method emits its output to.
public UnicodeDataCodeGeneratorC5_1_0 (UnicodeData catalog, TextWriter writer)
{
	w = writer;
	this.catalog = catalog;
}
285 public void GenerateStructures ()
287 w.WriteLine ("/* ======== Structures ======== */");
288 w.WriteLine (@"typedef struct {
292 } SimpleTitlecaseMapping;");
293 w.WriteLine (@"typedef struct {
297 w.WriteLine (@"typedef struct {
300 } SimpleCaseMapping;");
// Emits a C definition of `ranges` under the identifier `name`:
// a `<name>_count` constant followed by a CodePointRange array whose
// last element is a {0, 0} entry.
void GenerateCodePointRanges (string name, CodePointRange [] ranges)
{
	int total = ranges.Length;

	// NOTE(review): the count is declared as guint8 in the output, so
	// this presumably expects no more than 255 ranges - confirm.
	w.WriteLine ("static const guint8 {0}_count = {1};", name, total);
	w.WriteLine ("static const CodePointRange {0} [] = {{", name);
	for (int i = 0; i < total; i++) {
		CodePointRange range = ranges [i];
		w.WriteLine ("{{0x{0:X06}, 0x{1:X06}}},", range.Start, range.End);
	}
	// Single-argument overload: the text is written as-is, so these
	// braces are literal and not format placeholders.
	w.WriteLine ("{0, 0}};");
}
312 public void GenerateUnicodeCategoryListC (UcdCharacterProperty [] ucd)
314 w.WriteLine ("/* ======== Unicode Categories ======== */");
315 GenerateCodePointRanges ("unicode_category_ranges", catalog.CategoryRanges);
318 foreach (var cpr in catalog.CategoryRanges) {
319 w.WriteLine ("const GUnicodeType unicode_category_table{0} [] = {{", table);
320 w.WriteLine ("\t/* ==== {0:X}-{1:X} ==== */", cpr.Start, cpr.End);
323 foreach (var ucp in ucd) {
324 if (ucp.Codepoint >= cpr.End)
326 if (ucp.Codepoint < cp)
328 while (cp < ucp.Codepoint) {
331 // w.Write ("\n/* ==== {0:X} ==== */\n\t", cp);
332 w.Write ("\n\t", cp);
334 w.Write ((int) ToGUnicodeCategory (ucp.Category));
337 // w.Write ("\n/* ==== {0:X} ==== */\n\t", cp);
338 w.Write ("\n\t", cp);
346 w.WriteLine ("static const GUnicodeType *unicode_category [{0}] = {{", catalog.CategoryRanges.Length);
347 for (int i = 0, end = catalog.CategoryRanges.Length; i < end; i++)
348 w.WriteLine ("\tunicode_category_table{0}{1}", i, i + 1 < end ? "," : String.Empty);
352 public void GenerateSimpleTitlecaseMappingListC (UcdCharacterProperty [] ucd)
354 w.WriteLine ("static const SimpleTitlecaseMapping simple_titlecase_mapping [] = {");
356 foreach (var ucp in ucd) {
357 if (ucp.SimpleUppercaseMapping == ucp.SimpleTitlecaseMapping)
361 w.Write ("\t{{0x{0:X06}, 0x{1:X06}, 0x{2:X06}}}", ucp.Codepoint, ucp.SimpleUppercaseMapping, ucp.SimpleTitlecaseMapping);
366 w.WriteLine ("static const guint8 simple_titlecase_mapping_count = {0};", count);
// Emits the simple case mapping range list, then one set of tables for
// each (upper, small) combination in the order
// (true, true), (true, false), (false, true), (false, false).
public void GenerateSimpleCaseMappingListC (UcdCharacterProperty [] ucd)
{
	GenerateCodePointRanges ("simple_case_map_ranges", catalog.SimpleCases);

	bool [] choices = { true, false };
	foreach (bool upper in choices)
		foreach (bool small in choices)
			GenerateSimpleCaseMappingListC (ucd, upper, small);
}
378 void GenerateSimpleCaseMappingListC (UcdCharacterProperty [] ucd, bool upper, bool small)
381 foreach (var cpr in catalog.SimpleCases) {
382 if (small && cpr.Start > 0xFFFF)
384 if (!small && cpr.Start < 0x10000)
387 w.WriteLine ("static const {0} simple_{1}_case_mapping_{2}_table{3} [] = {{", small ? "guint16" : "guint32", upper ? "upper" : "lower", small ? "lowarea" : "higharea", nTable);
390 w.WriteLine ("\t/* ==== {0:X}-{1:X} ==== */", cpr.Start, cpr.End);
393 foreach (var ucp in ucd) {
394 if (ucp.Codepoint >= cpr.End)
396 if (ucp.Codepoint < cp)
398 while (cp < ucp.Codepoint) {
403 int v = upper ? ucp.SimpleUppercaseMapping : ucp.SimpleLowercaseMapping;
405 w.Write ("0x{0:X},", v);
409 if (++cp % 16 == 0) {
421 w.WriteLine ("static const {0} *simple_{1}_case_mapping_{2} [] = {{", small ? "guint16" : "guint32", upper ? "upper" : "lower", small ? "lowarea" : "higharea");
423 for (int i = 0; i < nTable; i++) {
426 w.Write ("\tstatic const guint8 simple_{0}_case_mapping_{1}_table{2}", upper ? "upper" : "lower", small ? "lowarea" : "higharea", i);
// Category codes written (as integers) into the generated category tables.
// The numeric values are spelled out because the output depends on them;
// do not reorder or renumber.
enum GUnicodeType
{
	G_UNICODE_CONTROL = 0,
	G_UNICODE_FORMAT = 1,
	G_UNICODE_UNASSIGNED = 2,
	G_UNICODE_PRIVATE_USE = 3,
	G_UNICODE_SURROGATE = 4,
	G_UNICODE_LOWERCASE_LETTER = 5,
	G_UNICODE_MODIFIER_LETTER = 6,
	G_UNICODE_OTHER_LETTER = 7,
	G_UNICODE_TITLECASE_LETTER = 8,
	G_UNICODE_UPPERCASE_LETTER = 9,
	G_UNICODE_COMBINING_MARK = 10,
	G_UNICODE_ENCLOSING_MARK = 11,
	G_UNICODE_NON_SPACING_MARK = 12,
	G_UNICODE_DECIMAL_NUMBER = 13,
	G_UNICODE_LETTER_NUMBER = 14,
	G_UNICODE_OTHER_NUMBER = 15,
	G_UNICODE_CONNECT_PUNCTUATION = 16,
	G_UNICODE_DASH_PUNCTUATION = 17,
	G_UNICODE_CLOSE_PUNCTUATION = 18,
	G_UNICODE_FINAL_PUNCTUATION = 19,
	G_UNICODE_INITIAL_PUNCTUATION = 20,
	G_UNICODE_OTHER_PUNCTUATION = 21,
	G_UNICODE_OPEN_PUNCTUATION = 22,
	G_UNICODE_CURRENCY_SYMBOL = 23,
	G_UNICODE_MODIFIER_SYMBOL = 24,
	G_UNICODE_MATH_SYMBOL = 25,
	G_UNICODE_OTHER_SYMBOL = 26,
	G_UNICODE_LINE_SEPARATOR = 27,
	G_UNICODE_PARAGRAPH_SEPARATOR = 28,
	G_UNICODE_SPACE_SEPARATOR = 29
}
// Lookup from the .NET category enumeration to the code emitted for it.
static readonly Dictionary<UnicodeCategory,GUnicodeType> g_unicode_categories =
	new Dictionary<UnicodeCategory,GUnicodeType> () {
		{ UnicodeCategory.UppercaseLetter, GUnicodeType.G_UNICODE_UPPERCASE_LETTER },
		{ UnicodeCategory.LowercaseLetter, GUnicodeType.G_UNICODE_LOWERCASE_LETTER },
		{ UnicodeCategory.TitlecaseLetter, GUnicodeType.G_UNICODE_TITLECASE_LETTER },
		{ UnicodeCategory.ModifierLetter, GUnicodeType.G_UNICODE_MODIFIER_LETTER },
		{ UnicodeCategory.OtherLetter, GUnicodeType.G_UNICODE_OTHER_LETTER },
		{ UnicodeCategory.NonSpacingMark, GUnicodeType.G_UNICODE_NON_SPACING_MARK },
		// "Spacing combining" is spelled COMBINING_MARK on the output side.
		{ UnicodeCategory.SpacingCombiningMark, GUnicodeType.G_UNICODE_COMBINING_MARK },
		{ UnicodeCategory.EnclosingMark, GUnicodeType.G_UNICODE_ENCLOSING_MARK },
		{ UnicodeCategory.DecimalDigitNumber, GUnicodeType.G_UNICODE_DECIMAL_NUMBER },
		{ UnicodeCategory.LetterNumber, GUnicodeType.G_UNICODE_LETTER_NUMBER },
		{ UnicodeCategory.OtherNumber, GUnicodeType.G_UNICODE_OTHER_NUMBER },
		{ UnicodeCategory.ConnectorPunctuation, GUnicodeType.G_UNICODE_CONNECT_PUNCTUATION },
		{ UnicodeCategory.DashPunctuation, GUnicodeType.G_UNICODE_DASH_PUNCTUATION },
		{ UnicodeCategory.OpenPunctuation, GUnicodeType.G_UNICODE_OPEN_PUNCTUATION },
		{ UnicodeCategory.ClosePunctuation, GUnicodeType.G_UNICODE_CLOSE_PUNCTUATION },
		{ UnicodeCategory.InitialQuotePunctuation, GUnicodeType.G_UNICODE_INITIAL_PUNCTUATION },
		{ UnicodeCategory.FinalQuotePunctuation, GUnicodeType.G_UNICODE_FINAL_PUNCTUATION },
		{ UnicodeCategory.OtherPunctuation, GUnicodeType.G_UNICODE_OTHER_PUNCTUATION },
		{ UnicodeCategory.MathSymbol, GUnicodeType.G_UNICODE_MATH_SYMBOL },
		{ UnicodeCategory.CurrencySymbol, GUnicodeType.G_UNICODE_CURRENCY_SYMBOL },
		{ UnicodeCategory.ModifierSymbol, GUnicodeType.G_UNICODE_MODIFIER_SYMBOL },
		{ UnicodeCategory.OtherSymbol, GUnicodeType.G_UNICODE_OTHER_SYMBOL },
		{ UnicodeCategory.SpaceSeparator, GUnicodeType.G_UNICODE_SPACE_SEPARATOR },
		{ UnicodeCategory.LineSeparator, GUnicodeType.G_UNICODE_LINE_SEPARATOR },
		{ UnicodeCategory.ParagraphSeparator, GUnicodeType.G_UNICODE_PARAGRAPH_SEPARATOR },
		{ UnicodeCategory.Control, GUnicodeType.G_UNICODE_CONTROL },
		{ UnicodeCategory.Format, GUnicodeType.G_UNICODE_FORMAT },
		{ UnicodeCategory.Surrogate, GUnicodeType.G_UNICODE_SURROGATE },
		{ UnicodeCategory.PrivateUse, GUnicodeType.G_UNICODE_PRIVATE_USE },
		{ UnicodeCategory.OtherNotAssigned, GUnicodeType.G_UNICODE_UNASSIGNED },
	};

// Translates a .NET UnicodeCategory into the GUnicodeType written to the
// category tables. A value outside the enumeration's named members is
// rejected with an ArgumentException.
GUnicodeType ToGUnicodeCategory (UnicodeCategory v)
{
	GUnicodeType mapped;
	if (g_unicode_categories.TryGetValue (v, out mapped))
		return mapped;
	throw new ArgumentException (String.Format ("Unexpected category {0}", v));
}
// A pair of code points delimiting one span of a generated table.
// NOTE(review): the generator loops test `Codepoint >= End`, which
// suggests End is exclusive - confirm against those loops.
public class CodePointRange
{
	public int Start { get; set; }
	public int End { get; set; }

	// Stores both bounds as given; no ordering check is performed.
	public CodePointRange (int start, int end)
	{
		this.Start = start;
		this.End = end;
	}
}
// One parsed character record. UnicodeData.Parse fills an instance from
// the ';'-separated fields of a single input line; the field numbers
// below refer to that split.
public class UcdCharacterProperty
{
	// Field 0, parsed as hexadecimal.
	public int Codepoint { get; set; }
	// NOTE(review): presumably field 1 (the character name) - confirm in Parse.
	public string Name { get; set; }
	// Field 2, translated from its two-letter abbreviation.
	public UnicodeCategory Category { get; set; }
	// Field 3; null when the field is empty.
	public byte? CanonicalCombiningClass { get; set; }
	// Field 4; UcdBidiClass.None when the field is empty.
	public UcdBidiClass BidiClass { get; set; }

	// Field 5, split into its optional "<tag>" and the code points that
	// follow it. The mapping is null when the field is empty.
	public UcdDecompositionType DecompositionType { get; set; }
	public int [] DecompositionMapping { get; set; }

	// Fields 6 to 8, kept as the raw text found in the record.
	public string DecimalDigitValue { get; set; }
	public string DigitValue { get; set; }
	public string NumericValue { get; set; }

	// Field 9; true only when the field is exactly "Y".
	public bool BidiMirrored { get; set; }
	// Fields 10 and 11, kept as raw text.
	public string Unicode1Name { get; set; }
	public string IsoComment { get; set; }

	// Fields 12 to 14, parsed as hexadecimal; 0 stands for "no mapping".
	public int SimpleUppercaseMapping { get; set; }
	public int SimpleLowercaseMapping { get; set; }
	public int SimpleTitlecaseMapping { get; set; }
}
567 public enum UcdBidiClass
591 public enum UcdDecompositionType