X-Git-Url: http://wien.tomnetworks.com/gitweb/?a=blobdiff_plain;f=mcs%2Fclass%2Fcorlib%2FMono.Globalization.Unicode%2Fcreate-mscompat-collation-table.cs;h=02bee38d373c50a0221d9d970d25cbfcb98d3590;hb=bd9f9ee7cb81823608edc76ef9d0b6416783fe71;hp=0419dd25f48ec451a7cc6c14c40ff89b3303ffcf;hpb=b18f539bf9db0c6dce92b21be539edbfbe8a7c0d;p=mono.git diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs b/mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs index 0419dd25f48..02bee38d373 100644 --- a/mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs +++ b/mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs @@ -1,4 +1,31 @@ // +// create-mscompat-collation-table.cs : generates Windows-like sortkey tables. +// +// Author: +// Atsushi Enomoto +// +// Copyright (C) 2005 Novell, Inc (http://www.novell.com) +// +// Permission is hereby granted, free of charge, to any person obtaining +// a copy of this software and associated documentation files (the +// "Software"), to deal in the Software without restriction, including +// without limitation the rights to use, copy, modify, merge, publish, +// distribute, sublicense, and/or sell copies of the Software, and to +// permit persons to whom the Software is furnished to do so, subject to +// the following conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// + // // There are two kind of sort keys : which are computed and which are laid out // as an indexed array. Computed sort keys are: @@ -6,24 +33,9 @@ // - Surrogate // - PrivateUse // -// Also, for composite characters it should prepare different index table. -// // Though it is possible to "compute" level 3 weights, they are still dumped // to an array to avoid execution cost. // - -// -// * sortkey getter signature -// -// int GetSortKey (string s, int index, SortKeyBuffer buf) -// Stores sort key for corresponding character element into buf and -// returns the length of the consumed _source_ character element in s. -// -// * character length to consume -// -// If there are characters whose primary weight is 0, they are consumed -// and considered as a part of the character element. -// #define Binary using System; @@ -33,6 +45,8 @@ using System.Globalization; using System.Text; using System.Xml; +using UUtil = Mono.Globalization.Unicode.MSCompatUnicodeTableUtil; + namespace Mono.Globalization.Unicode { internal class MSCompatSortKeyTableGenerator @@ -61,7 +75,8 @@ namespace Mono.Globalization.Unicode const int DecompositionCompat = 0x11; const int DecompositionCanonical = 0x12; - TextWriter Result = Console.Out; + TextWriter CSResult = Console.Out; + TextWriter CResult = TextWriter.Null; byte [] fillIndex = new byte [256]; // by category CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1]; @@ -97,7 +112,8 @@ namespace Mono.Globalization.Unicode string [] diacritics = new string [] { // LATIN, CYRILLIC etc. - "UPTURN", "DOUBLE-STRUCK", + "VERTICAL LINE ABOVE", "UPTURN", "DOUBLE-STRUCK", + "ABKHASIAN", "MIDDLE HOOK", "WITH VERTICAL LINE ABOVE;", "WITH TONOS", "WITH ACUTE ACCENT;", "WITH GRAVE ACCENT;", "WITH ACUTE;", "WITH GRAVE;", @@ -106,7 +122,9 @@ namespace Mono.Globalization.Unicode "WITH CIRCUMFLEX ACCENT;", "WITH CIRCUMFLEX;", "WITH DIALYTIKA;", "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;", - "DIALYTIKA TONOS", "DIALYTIKA AND TONOS", "WITH MACRON;", "WITH TILDE;", "WITH RING ABOVE;", + "DIALYTIKA TONOS", "DIALYTIKA AND TONOS", + "ABKHASIAN CHE WITH DESCENDER", + "WITH MACRON;", "WITH TILDE;", "WITH RING ABOVE;", "WITH OGONEK;", "WITH CEDILLA;", // " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;", @@ -128,8 +146,8 @@ namespace Mono.Globalization.Unicode " BREVE AND TILDE", " CEDILLA AND BREVE", " OGONEK AND MACRON", - // - "WITH OVERLINE", + // 0x40 + "WITH OVERLINE", "DOUBLE VERTICAL LINE ABOVE", "WITH HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;", " DOUBLE GRAVE", " INVERTED BREVE", @@ -139,11 +157,12 @@ namespace Mono.Globalization.Unicode " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE", " PALATAL HOOK", " DOT BELOW;", - " RETROFLEX;", "DIAERESIS BELOW", - " RING BELOW", + " RETROFLEX;", "DIAERESIS BELOW", "RETROFLEX HOOK", + " RING BELOW", "LOW VERTICAL LINE", // " CIRCUMFLEX BELOW", "HORN AND ACUTE", " BREVE BELOW;", " HORN AND GRAVE", + " LOW MACRON", " TILDE BELOW", " TOPBAR", " DOT BELOW AND DOT ABOVE", @@ -161,12 +180,12 @@ namespace Mono.Globalization.Unicode }; byte [] diacriticWeights = new byte [] { // LATIN. - 3, 3, 5, 5, 5, + 3, 3, 3, 5, 5, 5, 5, 0xE, 0xF, 0xE, 0xF, // 0x10, 0x11, 0x12, 0x12, 0x13, 0x13, 0x14, 0x15, 0x16, - 0x16, 0x17, 0x19, 0x1A, 0x1B, 0x1C, + 0x16, 0x17, 0x17, 0x19, 0x1A, 0x1B, 0x1C, // 0x1D, 0x1D, 0x1E, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F, 0x20, 0x21, 0x22, 0x22, 0x23, 0x24, @@ -174,10 +193,11 @@ namespace Mono.Globalization.Unicode 0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30, // - 0x40, 0x43, 0x43, 0x43, 0x44, 0x46, 0x47, 0x48, - 0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x5A, + 0x40, 0x41, 0x43, 0x43, 0x43, 0x44, 0x46, 0x47, 0x48, + 0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x59, + 0x5A, 0x5A, // - 0x60, 0x60, 0x61, 0x61, 0x63, 0x68, 0x68, + 0x60, 0x60, 0x61, 0x61, 0x62, 0x63, 0x68, 0x68, 0x69, 0x69, 0x6A, 0x6D, 0x6E, 0x87, 0x95, 0xAA, // CIRCLED, PARENTHESIZED and so on. @@ -253,7 +273,9 @@ namespace Mono.Globalization.Unicode ModifyParsedValues (); GenerateCore (); Console.Error.WriteLine ("generation done."); + CResult = new StreamWriter ("collation-tables.h", false); Serialize (); + CResult.Close (); Console.Error.WriteLine ("serialization done."); /* StreamWriter sw = new StreamWriter ("agelog.txt"); @@ -284,6 +306,11 @@ sw.Close (); source, typeof (ushort), i); } + void WriteByte (byte value) + { + + } + void Serialize () { // Tailorings @@ -293,12 +320,16 @@ sw.Close (); byte [] level1 = new byte [map.Length]; byte [] level2 = new byte [map.Length]; byte [] level3 = new byte [map.Length]; - ushort [] widthCompat = new ushort [map.Length]; +// widthCompat is now removed from the mapping table. +// If it turned out that it is still required, grep this source and uncomment +// widthCompat related lines. FIXME: remove those lines in the future. +// ushort [] widthCompat = new ushort [map.Length]; for (int i = 0; i < map.Length; i++) { categories [i] = map [i].Category; level1 [i] = map [i].Level1; level2 [i] = map [i].Level2; level3 [i] = ComputeLevel3Weight ((char) i); +/* // For Japanese Half-width characters, don't // map widthCompat. It is IgnoreKanaType that // handles those width differences. @@ -313,158 +344,189 @@ sw.Close (); widthCompat [i] = (ushort) decompValues [decompIndex [i]]; break; } +*/ } // compress ignorableFlags = CompressArray (ignorableFlags, - MSCompatUnicodeTableUtil.Ignorable); - categories = CompressArray (categories, - MSCompatUnicodeTableUtil.Category); - level1 = CompressArray (level1, - MSCompatUnicodeTableUtil.Level1); - level2 = CompressArray (level2, - MSCompatUnicodeTableUtil.Level2); - level3 = CompressArray (level3, - MSCompatUnicodeTableUtil.Level3); - widthCompat = (ushort []) CodePointIndexer.CompressArray ( - widthCompat, typeof (ushort), - MSCompatUnicodeTableUtil.WidthCompat); - cjkCHS = CompressArray (cjkCHS, - MSCompatUnicodeTableUtil.CjkCHS); - cjkCHT = CompressArray (cjkCHT, - MSCompatUnicodeTableUtil.Cjk); - cjkJA = CompressArray (cjkJA, - MSCompatUnicodeTableUtil.Cjk); - cjkKO = CompressArray (cjkKO, - MSCompatUnicodeTableUtil.Cjk); - cjkKOlv2 = CompressArray (cjkKOlv2, - MSCompatUnicodeTableUtil.Cjk); + UUtil.Ignorable); + categories = CompressArray (categories, UUtil.Category); + level1 = CompressArray (level1, UUtil.Level1); + level2 = CompressArray (level2, UUtil.Level2); + level3 = CompressArray (level3, UUtil.Level3); +// widthCompat = (ushort []) CodePointIndexer.CompressArray ( +// widthCompat, typeof (ushort), UUtil.WidthCompat); + cjkCHS = CompressArray (cjkCHS, UUtil.CjkCHS); + cjkCHT = CompressArray (cjkCHT,UUtil.Cjk); + cjkJA = CompressArray (cjkJA, UUtil.Cjk); + cjkKO = CompressArray (cjkKO, UUtil.Cjk); + cjkKOlv2 = CompressArray (cjkKOlv2, UUtil.Cjk); // Ignorables - Result.WriteLine ("internal static readonly byte [] ignorableFlags = new byte [] {"); + CResult.WriteLine ("static const guint8 collation_table_ignorableFlags [] = {"); + CSResult.WriteLine ("static readonly byte [] ignorableFlagsArr = new byte [] {"); #if Binary MemoryStream ms = new MemoryStream (); BinaryWriter binary = new BinaryWriter (ms); + binary.Write (UUtil.ResourceVersion); binary.Write (ignorableFlags.Length); #endif for (int i = 0; i < ignorableFlags.Length; i++) { byte value = ignorableFlags [i]; if (value < 10) - Result.Write ("{0},", value); + CSResult.Write ("{0},", value); else - Result.Write ("0x{0:X02},", value); + CSResult.Write ("0x{0:X02},", value); + CResult.Write ("{0},", value); #if Binary binary.Write (value); #endif - if ((i & 0xF) == 0xF) - Result.WriteLine ("// {0:X04}", i - 0xF); + if ((i & 0xF) == 0xF) { + CSResult.WriteLine ("// {0:X04}", + UUtil.Ignorable.ToCodePoint (i - 0xF)); + CResult.WriteLine (); + } } - Result.WriteLine ("};"); - Result.WriteLine (); + CResult.WriteLine ("0};"); + CSResult.WriteLine ("};"); + CSResult.WriteLine (); // Primary category - Result.WriteLine ("internal static readonly byte [] categories = new byte [] {"); + CResult.WriteLine ("static const guint8 collation_table_category [] = {"); + CSResult.WriteLine ("static readonly byte [] categoriesArr = new byte [] {"); #if Binary binary.Write (categories.Length); #endif for (int i = 0; i < categories.Length; i++) { byte value = categories [i]; if (value < 10) - Result.Write ("{0},", value); + CSResult.Write ("{0},", value); else - Result.Write ("0x{0:X02},", value); + CSResult.Write ("0x{0:X02},", value); + CResult.Write ("{0},", value); #if Binary binary.Write (value); #endif - if ((i & 0xF) == 0xF) - Result.WriteLine ("// {0:X04}", i - 0xF); + if ((i & 0xF) == 0xF) { + CSResult.WriteLine ("// {0:X04}", + UUtil.Category.ToCodePoint (i - 0xF)); + CResult.WriteLine (); + } } - Result.WriteLine ("};"); - Result.WriteLine (); + CResult.WriteLine ("};"); + CSResult.WriteLine ("};"); + CSResult.WriteLine (); // Primary weight value - Result.WriteLine ("internal static readonly byte [] level1 = new byte [] {"); + CResult.WriteLine ("static const guint8 collation_table_level1 [] = {"); + CSResult.WriteLine ("static readonly byte [] level1Arr = new byte [] {"); #if Binary binary.Write (level1.Length); #endif for (int i = 0; i < level1.Length; i++) { byte value = level1 [i]; if (value < 10) - Result.Write ("{0},", value); + CSResult.Write ("{0},", value); else - Result.Write ("0x{0:X02},", value); + CSResult.Write ("0x{0:X02},", value); + CResult.Write ("{0},", value); #if Binary binary.Write (value); #endif - if ((i & 0xF) == 0xF) - Result.WriteLine ("// {0:X04}", i - 0xF); + if ((i & 0xF) == 0xF) { + CSResult.WriteLine ("// {0:X04}", + UUtil.Level1.ToCodePoint (i - 0xF)); + CResult.WriteLine (); + } } - Result.WriteLine ("};"); - Result.WriteLine (); + CResult.WriteLine ("0};"); + CSResult.WriteLine ("};"); + CSResult.WriteLine (); // Secondary weight - Result.WriteLine ("internal static readonly byte [] level2 = new byte [] {"); + CResult.WriteLine ("static const guint8 collation_table_level2 [] = {"); + CSResult.WriteLine ("static readonly byte [] level2Arr = new byte [] {"); #if Binary binary.Write (level2.Length); #endif for (int i = 0; i < level2.Length; i++) { byte value = level2 [i]; if (value < 10) - Result.Write ("{0},", value); + CSResult.Write ("{0},", value); else - Result.Write ("0x{0:X02},", value); + CSResult.Write ("0x{0:X02},", value); + CResult.Write ("{0},", value); #if Binary binary.Write (value); #endif - if ((i & 0xF) == 0xF) - Result.WriteLine ("// {0:X04}", i - 0xF); + if ((i & 0xF) == 0xF) { + CSResult.WriteLine ("// {0:X04}", + UUtil.Level2.ToCodePoint (i - 0xF)); + CResult.WriteLine (); + } } - Result.WriteLine ("};"); - Result.WriteLine (); + CResult.WriteLine ("0};"); + CSResult.WriteLine ("};"); + CSResult.WriteLine (); // Thirtiary weight - Result.WriteLine ("internal static readonly byte [] level3 = new byte [] {"); + CResult.WriteLine ("static const guint8 collation_table_level3 [] = {"); + CSResult.WriteLine ("static readonly byte [] level3Arr = new byte [] {"); #if Binary binary.Write (level3.Length); #endif for (int i = 0; i < level3.Length; i++) { byte value = level3 [i]; if (value < 10) - Result.Write ("{0},", value); + CSResult.Write ("{0},", value); else - Result.Write ("0x{0:X02},", value); + CSResult.Write ("0x{0:X02},", value); + CResult.Write ("{0},", value); #if Binary binary.Write (value); #endif - if ((i & 0xF) == 0xF) - Result.WriteLine ("// {0:X04}", i - 0xF); + if ((i & 0xF) == 0xF) { + CSResult.WriteLine ("// {0:X04}", + UUtil.Level3.ToCodePoint (i - 0xF)); + CResult.WriteLine (); + } } - Result.WriteLine ("};"); - Result.WriteLine (); + CResult.WriteLine ("0};"); + CSResult.WriteLine ("};"); + CSResult.WriteLine (); +/* // Width insensitivity mappings // (for now it is more lightweight than dumping the // entire NFKD table). - Result.WriteLine ("internal static readonly ushort [] widthCompat = new ushort [] {"); + CResult.WriteLine ("static const guint16* widthCompat [] = {"); + CSResult.WriteLine ("static readonly ushort [] widthCompatArr = new ushort [] {"); #if Binary binary.Write (widthCompat.Length); #endif for (int i = 0; i < widthCompat.Length; i++) { ushort value = widthCompat [i]; if (value < 10) - Result.Write ("{0},", value); + CSResult.Write ("{0},", value); else - Result.Write ("0x{0:X02},", value); + CSResult.Write ("0x{0:X02},", value); + CResult.Write ("{0},", value); #if Binary binary.Write (value); #endif - if ((i & 0xF) == 0xF) - Result.WriteLine ("// {0:X04}", i - 0xF); + if ((i & 0xF) == 0xF) { + CSResult.WriteLine ("// {0:X04}", + UUtil.WidthCompat.ToCodePoint (i - 0xF)); + CResult.WriteLine (); + } } - Result.WriteLine ("};"); - Result.WriteLine (); + CResult.WriteLine ("0};"); + CSResult.WriteLine ("};"); + CSResult.WriteLine (); +*/ + #if Binary - using (FileStream fs = File.Create ("../collation.core.bin")) { + using (FileStream fs = File.Create ("../resources/collation.core.bin")) { byte [] array = ms.ToArray (); fs.Write (array, 0, array.Length); } @@ -478,33 +540,70 @@ sw.Close (); SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0); } - void SerializeCJK (string name, ushort [] cjk, int max) + void SerializeCJK (string name, ushort [] cjk, int max_unused) { - int offset = 0;//char.MaxValue - cjk.Length; - Result.WriteLine ("static ushort [] {0} = new ushort [] {{", name); +// CResult.WriteLine ("static const int collation_table_collation_cjk_{0}_size [] = {1};", name, cjk.Length); + CSResult.WriteLine ("const int {0}ArrLength = {1};", name, cjk.Length); + + int len = cjk.Length; + CResult.WriteLine ("static const guint8 collation_table_collation_cjk_{0} [] = {{", name); + CSResult.WriteLine ("static byte [] {0}Arr = new byte [] {{", name); + // the actual length is *2 + for (int i = 0; i < 4; i++, len /= 256) { + CResult.Write ("{0},", len & 0xFF); + CSResult.Write ("0x{0:X04},", len & 0xFF); + } + CResult.WriteLine (); + CSResult.WriteLine (); #if Binary MemoryStream ms = new MemoryStream (); BinaryWriter binary = new BinaryWriter (ms); - binary.Write (cjk.Length); + binary.Write (UUtil.ResourceVersion); + binary.Write (cjk.Length); // the actual size is *2. #endif + // category for (int i = 0; i < cjk.Length; i++) { - if (i + offset == max) - break; - ushort value = cjk [i]; +// if (i == max) +// break; + byte value = (byte) (cjk [i] >> 8); if (value < 10) - Result.Write ("{0},", value); + CSResult.Write ("{0},", value); else - Result.Write ("0x{0:X04},", value); + CSResult.Write ("0x{0:X02},", value); + CResult.Write ("{0},", value); #if Binary binary.Write (value); #endif - if ((i & 0xF) == 0xF) - Result.WriteLine ("// {0:X04}", i - 0xF + offset); + if ((i & 0xF) == 0xF) { + CSResult.WriteLine ("// {0:X04}", i - 0xF); + CResult.WriteLine (); + } } - Result.WriteLine ("};"); - Result.WriteLine (); + + // level 1 + for (int i = 0; i < cjk.Length; i++) { +// if (i == max) +// break; + byte value = (byte) (cjk [i] & 0xFF); + if (value < 10) + CSResult.Write ("{0},", value); + else + CSResult.Write ("0x{0:X02},", value); + CResult.Write ("{0},", value); #if Binary - using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) { + binary.Write (value); +#endif + if ((i & 0xF) == 0xF) { + CSResult.WriteLine ("// {0:X04}", i - 0xF); + CResult.WriteLine (); + } + } + + CResult.WriteLine ("0};"); + CSResult.WriteLine ("};"); + CSResult.WriteLine (); +#if Binary + using (FileStream fs = File.Create (String.Format ("../resources/collation.{0}.bin", name))) { byte [] array = ms.ToArray (); fs.Write (array, 0, array.Length); } @@ -513,30 +612,35 @@ sw.Close (); void SerializeCJK (string name, byte [] cjk, int max) { - int offset = 0;//char.MaxValue - cjk.Length; - Result.WriteLine ("static byte [] {0} = new byte [] {{", name); + CResult.WriteLine ("static const guint8 collation_table_collation_cjk_{0} [] = {{", name); + CSResult.WriteLine ("static byte [] {0}Arr = new byte [] {{", name); #if Binary MemoryStream ms = new MemoryStream (); BinaryWriter binary = new BinaryWriter (ms); + binary.Write (UUtil.ResourceVersion); #endif for (int i = 0; i < cjk.Length; i++) { - if (i + offset == max) + if (i == max) break; byte value = cjk [i]; if (value < 10) - Result.Write ("{0},", value); + CSResult.Write ("{0},", value); else - Result.Write ("0x{0:X02},", value); + CSResult.Write ("0x{0:X02},", value); + CResult.Write ("{0},", value); #if Binary binary.Write (value); #endif - if ((i & 0xF) == 0xF) - Result.WriteLine ("// {0:X04}", i - 0xF + offset); + if ((i & 0xF) == 0xF) { + CSResult.WriteLine ("// {0:X04}", i - 0xF); + CResult.WriteLine (); + } } - Result.WriteLine ("};"); - Result.WriteLine (); + CResult.WriteLine ("0};"); + CSResult.WriteLine ("};"); + CSResult.WriteLine (); #if Binary - using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) { + using (FileStream fs = File.Create (String.Format ("../resources/collation.{0}.bin", name))) { byte [] array = ms.ToArray (); fs.Write (array, 0, array.Length); } @@ -547,35 +651,46 @@ sw.Close (); { Hashtable indexes = new Hashtable (); Hashtable counts = new Hashtable (); - Result.WriteLine ("static char [] tailorings = new char [] {"); + CResult.WriteLine ("static const guint16 collation_table_tailoring [] = {"); + CSResult.WriteLine ("static char [] tailoringArr = new char [] {"); int count = 0; #if Binary MemoryStream ms = new MemoryStream (); BinaryWriter binary = new BinaryWriter (ms); + // Here we don't need to output resource version. + // This is cached. #endif foreach (Tailoring t in tailorings) { if (t.Alias != 0) continue; - Result.Write ("/*{0}*/", t.LCID); + CResult.Write ("/*{0}*/", t.LCID); + CSResult.Write ("/*{0}*/", t.LCID); indexes.Add (t.LCID, count); char [] values = t.ItemToCharArray (); counts.Add (t.LCID, values.Length); foreach (char c in values) { - Result.Write ("'\\x{0:X}', ", (int) c); - if (++count % 16 == 0) - Result.WriteLine (" // {0:X04}", count - 16); + CSResult.Write ("'\\x{0:X}', ", (int) c); + CResult.Write ("{0},", (int) c); + if (++count % 16 == 0) { + CSResult.WriteLine (" // {0:X04}", count - 16); + CResult.WriteLine (); + } #if Binary binary.Write ((ushort) c); #endif } } - Result.WriteLine ("};"); + CResult.WriteLine ("0};"); + CSResult.WriteLine ("};"); - Result.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {"); + CResult.WriteLine ("static const guint32 collation_table_tailoring_infos [] = {"); + CResult.WriteLine ("{0}, /*count*/", tailorings.Count); + CSResult.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {"); #if Binary byte [] rawdata = ms.ToArray (); ms = new MemoryStream (); binary = new BinaryWriter (ms); + binary.Write (UUtil.ResourceVersion); binary.Write (tailorings.Count); #endif foreach (Tailoring t in tailorings) { @@ -591,7 +706,8 @@ sw.Close (); foreach (Tailoring t2 in tailorings) if (t2.LCID == t.LCID) french = t2.FrenchSort; - Result.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false"); + CSResult.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false"); + CResult.WriteLine ("{0},{1},{2},{3},", t.LCID, idx, cnt, french ? 1 : 0); #if Binary binary.Write (t.LCID); binary.Write (idx); @@ -599,7 +715,8 @@ sw.Close (); binary.Write (french); #endif } - Result.WriteLine ("};"); + CResult.WriteLine ("0};"); + CSResult.WriteLine ("};"); #if Binary binary.Write ((byte) 0xFF); binary.Write ((byte) 0xFF); @@ -607,7 +724,7 @@ sw.Close (); binary.Write (rawdata, 0, rawdata.Length); - using (FileStream fs = File.Create ("../collation.tailoring.bin")) { + using (FileStream fs = File.Create ("../resources/collation.tailoring.bin")) { byte [] array = ms.ToArray (); fs.Write (array, 0, array.Length); } @@ -669,14 +786,17 @@ sw.Close (); { StringBuilder sb = new StringBuilder (); for (int i = 0; i < s.Length; i++) { - if (s.StartsWith ("\\u")) { - sb.Append ((char) int.Parse ( - s.Substring (2, 4), NumberStyles.HexNumber), + if (i + 5 < s.Length && + s [i] == '\\' && s [i + 1] == 'u') { + sb.Append ( + (char) int.Parse ( + s.Substring (i + 2, 4), + NumberStyles.HexNumber), 1); i += 5; } - else - sb.Append (s [i]); + else + sb.Append (s [i]); } return sb.ToString (); } @@ -856,10 +976,10 @@ sw.Close (); target = 'B'; else if (s.Substring (offset).StartsWith ("OPEN O")) target = 'C'; + else if (s.Substring (offset).StartsWith ("ETH")) + target = 'D'; else if (s.Substring (offset).StartsWith ("SCHWA")) target = 'E'; - else if (s.Substring (offset).StartsWith ("ENG")) - target = 'N'; else if (s.Substring (offset).StartsWith ("OI;")) // 01A2,01A3 target = 'O'; else if (s.Substring (offset).StartsWith ("YR;")) // 01A2,01A3 @@ -868,10 +988,15 @@ sw.Close (); target = 'S'; else if (s.Substring (offset).StartsWith ("ESH")) target = 'S'; + else if (s.Substring (offset).StartsWith ("OUNCE")) + target = 'Z'; // For remaining IPA chars, direct mapping is // much faster. switch (cp) { + case 0x0166: case 0x0167: + // Though they are 'T', they have different weight + target = char.MinValue; break; case 0x0299: target = 'B'; break; case 0x029A: target = 'E'; break; case 0x029B: target = 'G'; break; @@ -1098,12 +1223,12 @@ sw.Close (); // diacritical weights by character name if (diacritics.Length != diacriticWeights.Length) throw new Exception (String.Format ("Should not happen. weights are {0} while labels are {1}", diacriticWeights.Length, diacritics.Length)); - for (int d = 0; d < diacritics.Length; d++) { + for (int d = diacritics.Length - 1; d >= 0; d--) { if (s.IndexOf (diacritics [d]) > 0) { diacritical [cp] += diacriticWeights [d]; if (s.IndexOf ("COMBINING") >= 0) diacritical [cp] -= (byte) 2; - continue; + break; } // also process "COMBINING blah" here // For now it is limited to cp < 0x0370 @@ -1591,6 +1716,14 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la void ModifyUnidata () { + ArrayList decompValues = new ArrayList (this.decompValues); + + // Hebrew uppercase letters. + foreach (int i in new int [] + {0x05DB, 0x05DE, 0x05E0, 0x05E4, 0x05E6}) + isUppercase [i] = true; + + // Modify some decomposition equivalence for (int i = 0xFE31; i <= 0xFE34; i++) { decompType [i] = 0; @@ -1622,19 +1755,65 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la decompValues [decompIndex [0x3298]] = 0x52DE; // LAMESPEC: custom remapping (which is not bugs but not fine, non-standard compliant things) - decompIndex [0xFA0C] = decompIndex [0xF929]; // borrow U+F929 room (being empty) - decompValues [decompIndex [0xFA0C]] = 0x5140; + decompIndex [0xFA0C] = decompValues.Count; + decompValues.Add ((int) 0x5140); decompLength [0xFA0C] = 1; decompIndex [0xF929] = decompLength [0xF929] = 0; decompValues [decompIndex [0xF92C]] = 0x90DE; + + decompIndex [0x2125] = decompValues.Count; + decompValues.Add ((int) 0x005A); + decompLength [0x2125] = 1; + decompType [0x2125] = DecompositionFont; + + this.decompValues = decompValues.ToArray (typeof (int)) as int []; } void ModifyParsedValues () { + // Sometimes STROKE don't work fine + diacritical [0xD8] = diacritical [0xF8] = 0x21; + diacritical [0x141] = diacritical [0x142] = 0x1F; + // FIXME: why? + diacritical [0xAA] = diacritical [0xBA] = 3; + diacritical [0xD0] = diacritical [0xF0] = 0x68; + diacritical [0x131] = 3; + diacritical [0x138] = 3; + // TOPBAR does not work as an identifier for the weight + diacritical [0x182] = diacritical [0x183] = 0x68; // B + diacritical [0x18B] = diacritical [0x18C] = 0x1E; // D + // TONE TWO + diacritical [0x1A7] = diacritical [0x1A8] = 0x87; + // TONE SIX + diacritical [0x184] = diacritical [0x185] = 0x87; + // OPEN E + diacritical [0x190] = diacritical [0x25B] = 0x7B; + // There are many letters w/ diacritical weight 0x7B + diacritical [0x0192] = diacritical [0x0194] = + diacritical [0x0195] = diacritical [0x0196] = + diacritical [0x019C] = diacritical [0x019E] = + diacritical [0x01A6] = diacritical [0x01B1] = + diacritical [0x01B2] = diacritical [0x01BF] = 0x7B; + // ... as well as 0x7C + diacritical [0x01A2] = diacritical [0x01A3] = 0x7C; + + // NFKD characters seem to have diacritical + // weight as 3,4,5... but the order does not look + // by codepoint and I have no idea how they are sorted. + diacritical [0x210E] = 3; + diacritical [0x210F] = 0x68; + diacritical [0x2110] = 4; + diacritical [0x2111] = 5; + diacritical [0x2112] = 4; + diacritical [0x2113] = 4; + diacritical [0x211B] = 4; + diacritical [0x211C] = 5; + // some cyrillic diacritical weight. They seem to be // based on old character names, so it's quicker to // set them directly here. + // FIXME: they are by mostly unknown reason diacritical [0x0496] = diacritical [0x0497] = 7; diacritical [0x0498] = diacritical [0x0499] = 0x1A; diacritical [0x049A] = diacritical [0x049B] = 0x17; @@ -1643,6 +1822,15 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la diacritical [0x04A0] = diacritical [0x04A1] = 0xA; diacritical [0x04A2] = diacritical [0x04A3] = 7; diacritical [0x04A4] = diacritical [0x04A5] = 8; + diacritical [0x04AA] = diacritical [0x04AB] = 0x1A; // ES CEDILLA? + diacritical [0x04AC] = diacritical [0x04AD] = 7; // RIGHT DESCENDER? but U+4B2 + diacritical [0x04AE] = diacritical [0x04AF] = 0xB; // STRAIGHT U? + diacritical [0x04B2] = diacritical [0x04B3] = 0x17; // RIGHT DESCENDER? but U+4AC + diacritical [0x04B4] = diacritical [0x04B5] = 3; + diacritical [0x04B6] = 8; + diacritical [0x04B7] = 7; + diacritical [0x04B8] = diacritical [0x04B9] = 9; + diacritical [0x04BA] = diacritical [0x04BB] = 9; // number, secondary weights byte weight = 0x38; @@ -1652,6 +1840,13 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la if (Char.IsNumber ((char) cp)) diacritical [cp] = weight; + // Gurmukhi special letters' diacritical weight + for (int i = 0x0A50; i < 0x0A60; i++) + diacritical [i] = 4; + // Oriya special letters' diacritical weight + for (int i = 0x0B5C; i < 0x0B60; i++) + diacritical [i] = 6; + // Update name part of named characters for (int i = 0; i < sortableCharNames.Count; i++) { DictionaryEntry de = @@ -1757,7 +1952,7 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la fillIndex [6] = 0xA0; // vowels for (int i = 0x64B; i <= 0x650; i++) - AddArabicCharMap ((char) i); + AddArabicCharMap ((char) i, 6, 1, 0); // sukun AddCharMapGroup ('\u0652', 6, 1, 0); // shadda @@ -1803,11 +1998,12 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la if (!IsIgnorable (i)) AddCharMap ((char) i, 0x1, 1); + // FIXME: needs more love here (it should eliminate // all the hacky code above). for (int i = 0x0300; i < 0x0370; i++) if (!IsIgnorable (i) && diacritical [i] != 0 - /* especiall here*/ && !map [i].Defined) + && !map [i].Defined) map [i] = new CharMapEntry ( 0x1, 0x1, diacritical [i]); @@ -1855,9 +2051,15 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la fillIndex [0x1] = 0xEC; for (int i = 0x20DD; i <= 0x20E1; i++) AddCharMap ((char) i, 0x1, 1); - fillIndex [0x1] = 0x7; + fillIndex [0x1] = 0x4; + AddCharMap ('\u0CD5', 0x1, 1); + AddCharMap ('\u0CD6', 0x1, 1); + AddCharMap ('\u093C', 0x1, 1); for (int i = 0x302A; i <= 0x302D; i++) AddCharMap ((char) i, 0x1, 1); + AddCharMap ('\u0C55', 0x1, 1); + AddCharMap ('\u0C56', 0x1, 1); + fillIndex [0x1] = 0x50; // I wonder how they are sorted for (int i = 0x02D4; i <= 0x02D7; i++) AddCharMap ((char) i, 0x1, 1); @@ -2197,19 +2399,63 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la AddCharMapGroup ((char) i, 0xE, 1, 0); } - // Greek and Coptic - fillIndex [0xF] = 02; - for (int i = 0x0380; i < 0x0390; i++) + // IPA extensions + // FIXME: this results in not equivalent values to + // Windows, but is safer for comparison. + char [] ipaArray = new char [0x300 - 0x250 + 0x20]; + for (int i = 0x40; i < 0x60; i++) if (Char.IsLetter ((char) i)) - AddLetterMap ((char) i, 0xF, 1); - fillIndex [0xF] = 02; - for (int i = 0x0391; i < 0x03CF; i++) + ipaArray [i - 0x40] = (char) (i); + for (int i = 0x250; i < 0x300; i++) if (Char.IsLetter ((char) i)) - AddLetterMap ((char) i, 0xF, 1); + ipaArray [i - 0x250 + 0x20] = (char) i; + Array.Sort (ipaArray, UCAComparer.Instance); + int targetASCII = 0; + byte latinDiacritical = 0x7B; + foreach (char c in ipaArray) { + if (c <= 'Z') { + targetASCII = c; + latinDiacritical = 0x7B; + } + else + map [(int) c] = new CharMapEntry ( + 0xE, + map [targetASCII].Level1, + latinDiacritical++); + } + + // Greek and Coptic + + // FIXME: this is (mysterious and) incomplete. + for (int i = 0x0380; i < 0x0400; i++) + if (diacritical [i] == 0 && + decompLength [i] == 1 && + decompType [i] == DecompositionCompat) + diacritical [i] = 3; + + fillIndex [0xF] = 2; + for (int i = 0x0391; i < 0x03AA; i++) + if (i != 0x03A2) + AddCharMap ((char) i, 0xF, 1, + diacritical [i]); + fillIndex [0xF] = 2; + for (int i = 0x03B1; i < 0x03CA; i++) + if (i != 0x03C2) + AddCharMap ((char) i, 0xF, 1, + diacritical [i]); + // Final Sigma + map [0x03C2] = new CharMapEntry (0xF, + map [0x03C3].Level1, map [0x03C3].Level2); + fillIndex [0xF] = 0x40; - for (int i = 0x03D0; i < 0x0400; i++) - if (Char.IsLetter ((char) i)) - AddLetterMap ((char) i, 0xF, 1); + for (int i = 0x03DA; i < 0x03F0; i++) + AddCharMap ((char) i, 0xF, + (byte) (i % 2 == 0 ? 0 : 2), + diacritical [i]); + + // NFKD + for (int i = 0x0386; i <= 0x0400; i++) + FillLetterNFKD (i, true, true); // Cyrillic. // Cyrillic letters are sorted like Latin letters i.e. @@ -2282,6 +2528,10 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la } } + // NFKD + for (int i = 0x0401; i <= 0x045F; i++) + FillLetterNFKD (i, false, false); + for (int i = 0; i < cymap_src.Length; i++) { char c = cymap_src [i]; fillIndex [0x10] = map [c].Level1; @@ -2303,8 +2553,14 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la // -Letters fillIndex [0x12] = 0x2; for (int i = 0x05D0; i < 0x05FF; i++) - if (Char.IsLetter ((char) i)) - AddLetterMap ((char) i, 0x12, 1); + if (Char.IsLetter ((char) i)) { + if (isUppercase [i]) { + fillIndex [0x12]--; + AddLetterMap ((char) i, 0x12, 2); + } + else + AddLetterMap ((char) i, 0x12, 1); + } // -Accents fillIndex [0x1] = 0x3; for (int i = 0x0591; i <= 0x05C2; i++) { @@ -2341,7 +2597,8 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la case 0x0649: formDiacritical = 5; break; case 0x064A: formDiacritical = 7; break; } - AddLetterMapCore ((char) i, 0x13, 1, formDiacritical, false); +// AddLetterMapCore ((char) i, 0x13, 1, formDiacritical, false); + AddArabicCharMap ((char) i, 0x13, 1, formDiacritical); } for (int i = 0x0670; i < 0x0673; i++) map [i] = new CharMapEntry (0x13, 0xB, (byte) (0xC + i - 0x670)); @@ -2493,7 +2750,7 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la AddLetterMap ((char) i, 0x1, 1); continue; } - AddLetterMap ((char) i, 0x18, 1); + AddLetterMapCore ((char) i, 0x18, 1, 0, true); } // Tamil @@ -3046,18 +3303,22 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la // Here Windows mapping is not straightforward. It is // not based on computation but seems manual sorting. AddCharMapGroup ('+', 0x8, 1, 0); // plus - AddCharMapGroup ('\u2212', 0x8, 1, 0); // minus - AddCharMapGroup ('\u229D', 0x8, 1, 0); // minus - AddCharMapGroup ('\u2297', 0x8, 1, 0); // mul - AddCharMapGroup ('\u2044', 0x8, 1, 0); // div - AddCharMapGroup ('\u2215', 0x8, 1, 0); // div - AddCharMapGroup ('\u2217', 0x8, 1, 0); // mul - AddCharMapGroup ('\u2218', 0x8, 1, 0); // ring - AddCharMapGroup ('\u2219', 0x8, 1, 0); // bullet - AddCharMapGroup ('\u2213', 0x8, 1, 0); // minus-or-plus - AddCharMapGroup ('\u003C', 0x8, 1, 0); // < - AddCharMapGroup ('\u227A', 0x8, 1, 0); // precedes relation - AddCharMapGroup ('\u22B0', 0x8, 1, 0); // precedes under relation + AddCharMapGroup ('\u2212', 0x8, 1); // minus + AddCharMapGroup ('\u229D', 0x8, 1); // minus + AddCharMapGroup ('\u2297', 0x8, 1); // mul + AddCharMapGroup ('\u2044', 0x8, 1); // div + AddCharMapGroup ('\u2215', 0x8, 0); // div + AddCharMapGroup ('\u2298', 0x8, 1); // div slash + AddCharMapGroup ('\u2217', 0x8, 0); // mul + AddCharMapGroup ('\u229B', 0x8, 1); // asterisk oper + AddCharMapGroup ('\u2218', 0x8, 0); // ring + AddCharMapGroup ('\u229A', 0x8, 1); // ring + AddCharMapGroup ('\u2219', 0x8, 0); // bullet + AddCharMapGroup ('\u2299', 0x8, 1); // dot oper + AddCharMapGroup ('\u2213', 0x8, 1); // minus-or-plus + AddCharMapGroup ('\u003C', 0x8, 1); // < + AddCharMapGroup ('\u227A', 0x8, 1); // precedes relation + AddCharMapGroup ('\u22B0', 0x8, 1); // precedes under relation for (int cp = 0; cp < 0x2300; cp++) { if (cp == 0xAC) // SPECIAL CASE: skip @@ -3076,7 +3337,7 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la // Char.GetUnicodeCategory ((char) cp) == // UnicodeCategory.MathSymbol) Char.IsSymbol ((char) cp)) - AddCharMapGroup ((char) cp, 0x8, 1, diacritical [cp]); + AddCharMapGroup ((char) cp, 0x8, 1); // SPECIAL CASES: no idea why Windows sorts as such switch (cp) { case 0x3E: @@ -3084,10 +3345,10 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la AddCharMap ('\u22B1', 0x8, 1, 0); break; case 0xB1: - AddCharMapGroup ('\u00AB', 0x8, 1, 0); - AddCharMapGroup ('\u226A', 0x8, 1, 0); - AddCharMapGroup ('\u00BB', 0x8, 1, 0); - AddCharMapGroup ('\u226B', 0x8, 1, 0); + AddCharMapGroup ('\u00AB', 0x8, 1); + AddCharMapGroup ('\u226A', 0x8, 1); + AddCharMapGroup ('\u00BB', 0x8, 1); + AddCharMapGroup ('\u226B', 0x8, 1); break; case 0xF7: AddCharMap ('\u01C0', 0x8, 1, 0); @@ -3158,6 +3419,10 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la mod = diacritical [i]; break; case 0x13: // Arabic + if (i == 0x0621) + break; // 0 + if (diacritical [i] == 0 && decompLength [i] != 0) + diacritical [i] = map [decompValues [decompIndex [i]]].Level2; if (diacritical [i] == 0 && i >= 0xFE8D) mod = 0x8; // default for arabic break; @@ -3192,9 +3457,57 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la else AddCharMap ((char) i, 1, 1); } + #endregion } + TextInfo ti = CultureInfo.InvariantCulture.TextInfo; + + private void FillLetterNFKD (int i, bool checkUpper, bool greekRemap) + { + if (map [i].Defined) + return; + int up = (int) ti.ToUpper ((char) i); + if (checkUpper && map [up].Category == 0xF) { + if (i == up) + return; + FillLetterNFKD (up, checkUpper, greekRemap); + map [i] = new CharMapEntry (0xF, + map [up].Level1, + map [up].Level2); + } else { + int idx = decompIndex [i]; + if (idx == 0) + return; + int primary = decompValues [decompIndex [i]]; + FillLetterNFKD (primary, checkUpper, greekRemap); + + int lv2 = map [primary].Level2; + byte off = 0; + for (int l = 1; l < decompLength [i]; l++) { + int tmp = decompValues [idx + l]; + if (map [tmp].Category != 1) + return; + if (greekRemap && map [tmp].Level2 == 0xC) + off += 3; + else + off += map [tmp].Level2; + } + if (off > 0) { + if (lv2 == 0) + lv2 += 2; + lv2 += off; + } + // ... but override if the value already exists. + if (diacritical [i] != 0) + lv2 = diacritical [i]; + map [i] = new CharMapEntry ( + map [primary].Category, + map [primary].Level1, + (byte) lv2); + } + } + private void IncrementSequentialIndex (ref byte hangulCat) { fillIndex [hangulCat]++; @@ -3291,6 +3604,11 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la DecompositionWide, DecompositionNarrow, }; + private void AddCharMapGroup (char c, byte category, byte updateCount) + { + AddCharMapGroup (c, category, updateCount, 0, true); + } + private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2) { AddCharMapGroup (c, category, updateCount, level2, false); @@ -3470,12 +3788,8 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la } } - private void AddArabicCharMap (char c) + private void AddArabicCharMap (char c, byte category, byte updateCount, byte level2) { - byte category = 6; - byte updateCount = 1; - byte level2 = 0; - // itself AddCharMap (c, category, 0, level2); @@ -3577,18 +3891,28 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la // Arabic if ('\u2135' <= c && c <= '\u2138') return 4; - if ('\uFE80' <= c && c < '\uFF00') { + // I believe that Windows has a bug on setting level 3 + // weight here. NFKD results in different values. + if ('\uFE80' < c && c < '\uFF00') { // 2(Isolated)/8(Final)/0x18(Medial) switch (decompType [(int) c]) { case DecompositionIsolated: - return 2; + return 0; // 2; case DecompositionFinal: return 8; case DecompositionMedial: return 0x18; + case DecompositionInitial: + return 0x10; } } + // I have no idea why those symbols have level 3 weight + if (c == '\u2104' || c == '\u212B') + return 0x18; + if ('\u211E' <= c && c <= '\u212B') + return 0x10; + // actually I dunno the reason why they have weights. switch (c) { case '\u01BC': @@ -3609,7 +3933,6 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la byte ret = 0; switch (c) { case '\u03C2': - case '\u2104': case '\u212B': ret = 8; break;