X-Git-Url: http://wien.tomnetworks.com/gitweb/?a=blobdiff_plain;f=mcs%2Fclass%2Fcorlib%2FMono.Globalization.Unicode%2Fcreate-mscompat-collation-table.cs;h=02bee38d373c50a0221d9d970d25cbfcb98d3590;hb=bd9f9ee7cb81823608edc76ef9d0b6416783fe71;hp=6811b83855af244089bec580709762265900f4cd;hpb=6c5e0f97434a60a1a5b7785cb68f83b6b57010d7;p=mono.git diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs b/mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs index 6811b83855a..02bee38d373 100644 --- a/mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs +++ b/mcs/class/corlib/Mono.Globalization.Unicode/create-mscompat-collation-table.cs @@ -1,4 +1,31 @@ // +// create-mscompat-collation-table.cs : generates Windows-like sortkey tables. +// +// Author: +// Atsushi Enomoto +// +// Copyright (C) 2005 Novell, Inc (http://www.novell.com) +// +// Permission is hereby granted, free of charge, to any person obtaining +// a copy of this software and associated documentation files (the +// "Software"), to deal in the Software without restriction, including +// without limitation the rights to use, copy, modify, merge, publish, +// distribute, sublicense, and/or sell copies of the Software, and to +// permit persons to whom the Software is furnished to do so, subject to +// the following conditions: +// +// The above copyright notice and this permission notice shall be +// included in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +// + // // There are two kind of sort keys : which are computed and which are laid out // as an indexed array. Computed sort keys are: @@ -6,24 +33,9 @@ // - Surrogate // - PrivateUse // -// Also, for composite characters it should prepare different index table. -// // Though it is possible to "compute" level 3 weights, they are still dumped // to an array to avoid execution cost. // - -// -// * sortkey getter signature -// -// int GetSortKey (string s, int index, SortKeyBuffer buf) -// Stores sort key for corresponding character element into buf and -// returns the length of the consumed _source_ character element in s. -// -// * character length to consume -// -// If there are characters whose primary weight is 0, they are consumed -// and considered as a part of the character element. -// #define Binary using System; @@ -33,6 +45,8 @@ using System.Globalization; using System.Text; using System.Xml; +using UUtil = Mono.Globalization.Unicode.MSCompatUnicodeTableUtil; + namespace Mono.Globalization.Unicode { internal class MSCompatSortKeyTableGenerator @@ -61,7 +75,8 @@ namespace Mono.Globalization.Unicode const int DecompositionCompat = 0x11; const int DecompositionCanonical = 0x12; - TextWriter Result = Console.Out; + TextWriter CSResult = Console.Out; + TextWriter CResult = TextWriter.Null; byte [] fillIndex = new byte [256]; // by category CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1]; @@ -96,12 +111,20 @@ namespace Mono.Globalization.Unicode byte [] diacritical = new byte [char.MaxValue + 1]; string [] diacritics = new string [] { - // LATIN - "WITH VERTICAL LINE ABOVE;", - "WITH GRAVE ACCENT;", "WITH ACUTE ACCENT;", "WITH CIRCUMFLEX ACCENT;", - "WITH ACUTE;", "WITH GRAVE;", "WITH DOT ABOVE;", " MIDDLE DOT;", - "WITH CIRCUMFLEX;", "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;", - " DIALYTIKA AND TONOS;", "WITH MACRON;", "WITH TILDE;", "WITH RING ABOVE;", + // LATIN, CYRILLIC etc. + "VERTICAL LINE ABOVE", "UPTURN", "DOUBLE-STRUCK", + "ABKHASIAN", + "MIDDLE HOOK", "WITH VERTICAL LINE ABOVE;", "WITH TONOS", + "WITH ACUTE ACCENT;", "WITH GRAVE ACCENT;", + "WITH ACUTE;", "WITH GRAVE;", + // + "WITH DOT ABOVE;", " MIDDLE DOT;", + "WITH CIRCUMFLEX ACCENT;", "WITH CIRCUMFLEX;", + "WITH DIALYTIKA;", + "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;", + "DIALYTIKA TONOS", "DIALYTIKA AND TONOS", + "ABKHASIAN CHE WITH DESCENDER", + "WITH MACRON;", "WITH TILDE;", "WITH RING ABOVE;", "WITH OGONEK;", "WITH CEDILLA;", // " DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;", @@ -123,10 +146,10 @@ namespace Mono.Globalization.Unicode " BREVE AND TILDE", " CEDILLA AND BREVE", " OGONEK AND MACRON", - // - "WITH OVERLINE", + // 0x40 + "WITH OVERLINE", "DOUBLE VERTICAL LINE ABOVE", "WITH HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;", - " DOUBLE GRAVE;", + " DOUBLE GRAVE", " INVERTED BREVE", "ROMAN NUMERAL", " PRECEDED BY APOSTROPHE", @@ -134,11 +157,12 @@ namespace Mono.Globalization.Unicode " LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE", " PALATAL HOOK", " DOT BELOW;", - " RETROFLEX;", "DIAERESIS BELOW", - " RING BELOW", + " RETROFLEX;", "DIAERESIS BELOW", "RETROFLEX HOOK", + " RING BELOW", "LOW VERTICAL LINE", // " CIRCUMFLEX BELOW", "HORN AND ACUTE", " BREVE BELOW;", " HORN AND GRAVE", + " LOW MACRON", " TILDE BELOW", " TOPBAR", " DOT BELOW AND DOT ABOVE", @@ -146,6 +170,7 @@ namespace Mono.Globalization.Unicode " CIRCUMFLEX AND DOT BELOW", " BREVE AND DOT BELOW", " DOT BELOW AND MACRON", + " TONE TWO", " HORN AND HOOK ABOVE", " HORN AND DOT", // CIRCLED, PARENTHESIZED and so on @@ -155,10 +180,12 @@ namespace Mono.Globalization.Unicode }; byte [] diacriticWeights = new byte [] { // LATIN. - 5, - 0xF, 0xE, 0x12, - 0xE, 0xF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, - 0x17, 0x19, 0x1A, 0x1B, 0x1C, + 3, 3, 3, 5, 5, 5, 5, + 0xE, 0xF, + 0xE, 0xF, + // + 0x10, 0x11, 0x12, 0x12, 0x13, 0x13, 0x14, 0x15, 0x16, + 0x16, 0x17, 0x17, 0x19, 0x1A, 0x1B, 0x1C, // 0x1D, 0x1D, 0x1E, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F, 0x20, 0x21, 0x22, 0x22, 0x23, 0x24, @@ -166,12 +193,13 @@ namespace Mono.Globalization.Unicode 0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30, // - 0x40, 0x43, 0x43, 0x43, 0x44, 0x46, 0x47, 0x48, - 0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x5A, + 0x40, 0x41, 0x43, 0x43, 0x43, 0x44, 0x46, 0x47, 0x48, + 0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x59, + 0x5A, 0x5A, // - 0x60, 0x60, 0x61, 0x61, 0x63, 0x68, 0x68, + 0x60, 0x60, 0x61, 0x61, 0x62, 0x63, 0x68, 0x68, 0x69, 0x69, 0x6A, 0x6D, 0x6E, - 0x95, 0xAA, + 0x87, 0x95, 0xAA, // CIRCLED, PARENTHESIZED and so on. 0xEE, 0xEE, 0xEE, 0xEE, 0xEE, 0xF3, 0xF3, 0xF3 @@ -185,7 +213,6 @@ namespace Mono.Globalization.Unicode 0xE50, 0xE60, 0xED0, 0xEE0 }; - char [] orderedCyrillic; char [] orderedGurmukhi; char [] orderedGujarati; char [] orderedGeorgian; @@ -212,11 +239,9 @@ namespace Mono.Globalization.Unicode // cp -> level1 value Hashtable arabicLetterPrimaryValues = new Hashtable (); - Hashtable cyrillicLetterPrimaryValues = new Hashtable (); // letterName -> cp Hashtable arabicNameMap = new Hashtable (); - Hashtable cyrillicNameMap = new Hashtable (); // cp -> Hashtable [decompType] -> cp Hashtable nfkdMap = new Hashtable (); @@ -248,7 +273,9 @@ namespace Mono.Globalization.Unicode ModifyParsedValues (); GenerateCore (); Console.Error.WriteLine ("generation done."); + CResult = new StreamWriter ("collation-tables.h", false); Serialize (); + CResult.Close (); Console.Error.WriteLine ("serialization done."); /* StreamWriter sw = new StreamWriter ("agelog.txt"); @@ -279,6 +306,11 @@ sw.Close (); source, typeof (ushort), i); } + void WriteByte (byte value) + { + + } + void Serialize () { // Tailorings @@ -288,12 +320,21 @@ sw.Close (); byte [] level1 = new byte [map.Length]; byte [] level2 = new byte [map.Length]; byte [] level3 = new byte [map.Length]; - ushort [] widthCompat = new ushort [map.Length]; +// widthCompat is now removed from the mapping table. +// If it turned out that it is still required, grep this source and uncomment +// widthCompat related lines. FIXME: remove those lines in the future. +// ushort [] widthCompat = new ushort [map.Length]; for (int i = 0; i < map.Length; i++) { categories [i] = map [i].Category; level1 [i] = map [i].Level1; level2 [i] = map [i].Level2; level3 [i] = ComputeLevel3Weight ((char) i); +/* + // For Japanese Half-width characters, don't + // map widthCompat. It is IgnoreKanaType that + // handles those width differences. + if (0xFF6D <= i && i <= 0xFF9D) + continue; switch (decompType [i]) { case DecompositionNarrow: case DecompositionWide: @@ -303,158 +344,189 @@ sw.Close (); widthCompat [i] = (ushort) decompValues [decompIndex [i]]; break; } +*/ } // compress ignorableFlags = CompressArray (ignorableFlags, - MSCompatUnicodeTableUtil.Ignorable); - categories = CompressArray (categories, - MSCompatUnicodeTableUtil.Category); - level1 = CompressArray (level1, - MSCompatUnicodeTableUtil.Level1); - level2 = CompressArray (level2, - MSCompatUnicodeTableUtil.Level2); - level3 = CompressArray (level3, - MSCompatUnicodeTableUtil.Level3); - widthCompat = (ushort []) CodePointIndexer.CompressArray ( - widthCompat, typeof (ushort), - MSCompatUnicodeTableUtil.WidthCompat); - cjkCHS = CompressArray (cjkCHS, - MSCompatUnicodeTableUtil.CjkCHS); - cjkCHT = CompressArray (cjkCHT, - MSCompatUnicodeTableUtil.Cjk); - cjkJA = CompressArray (cjkJA, - MSCompatUnicodeTableUtil.Cjk); - cjkKO = CompressArray (cjkKO, - MSCompatUnicodeTableUtil.Cjk); - cjkKOlv2 = CompressArray (cjkKOlv2, - MSCompatUnicodeTableUtil.Cjk); + UUtil.Ignorable); + categories = CompressArray (categories, UUtil.Category); + level1 = CompressArray (level1, UUtil.Level1); + level2 = CompressArray (level2, UUtil.Level2); + level3 = CompressArray (level3, UUtil.Level3); +// widthCompat = (ushort []) CodePointIndexer.CompressArray ( +// widthCompat, typeof (ushort), UUtil.WidthCompat); + cjkCHS = CompressArray (cjkCHS, UUtil.CjkCHS); + cjkCHT = CompressArray (cjkCHT,UUtil.Cjk); + cjkJA = CompressArray (cjkJA, UUtil.Cjk); + cjkKO = CompressArray (cjkKO, UUtil.Cjk); + cjkKOlv2 = CompressArray (cjkKOlv2, UUtil.Cjk); // Ignorables - Result.WriteLine ("internal static readonly byte [] ignorableFlags = new byte [] {"); + CResult.WriteLine ("static const guint8 collation_table_ignorableFlags [] = {"); + CSResult.WriteLine ("static readonly byte [] ignorableFlagsArr = new byte [] {"); #if Binary MemoryStream ms = new MemoryStream (); BinaryWriter binary = new BinaryWriter (ms); + binary.Write (UUtil.ResourceVersion); binary.Write (ignorableFlags.Length); #endif for (int i = 0; i < ignorableFlags.Length; i++) { byte value = ignorableFlags [i]; if (value < 10) - Result.Write ("{0},", value); + CSResult.Write ("{0},", value); else - Result.Write ("0x{0:X02},", value); + CSResult.Write ("0x{0:X02},", value); + CResult.Write ("{0},", value); #if Binary binary.Write (value); #endif - if ((i & 0xF) == 0xF) - Result.WriteLine ("// {0:X04}", i - 0xF); + if ((i & 0xF) == 0xF) { + CSResult.WriteLine ("// {0:X04}", + UUtil.Ignorable.ToCodePoint (i - 0xF)); + CResult.WriteLine (); + } } - Result.WriteLine ("};"); - Result.WriteLine (); + CResult.WriteLine ("0};"); + CSResult.WriteLine ("};"); + CSResult.WriteLine (); // Primary category - Result.WriteLine ("internal static readonly byte [] categories = new byte [] {"); + CResult.WriteLine ("static const guint8 collation_table_category [] = {"); + CSResult.WriteLine ("static readonly byte [] categoriesArr = new byte [] {"); #if Binary binary.Write (categories.Length); #endif for (int i = 0; i < categories.Length; i++) { byte value = categories [i]; if (value < 10) - Result.Write ("{0},", value); + CSResult.Write ("{0},", value); else - Result.Write ("0x{0:X02},", value); + CSResult.Write ("0x{0:X02},", value); + CResult.Write ("{0},", value); #if Binary binary.Write (value); #endif - if ((i & 0xF) == 0xF) - Result.WriteLine ("// {0:X04}", i - 0xF); + if ((i & 0xF) == 0xF) { + CSResult.WriteLine ("// {0:X04}", + UUtil.Category.ToCodePoint (i - 0xF)); + CResult.WriteLine (); + } } - Result.WriteLine ("};"); - Result.WriteLine (); + CResult.WriteLine ("};"); + CSResult.WriteLine ("};"); + CSResult.WriteLine (); // Primary weight value - Result.WriteLine ("internal static readonly byte [] level1 = new byte [] {"); + CResult.WriteLine ("static const guint8 collation_table_level1 [] = {"); + CSResult.WriteLine ("static readonly byte [] level1Arr = new byte [] {"); #if Binary binary.Write (level1.Length); #endif for (int i = 0; i < level1.Length; i++) { byte value = level1 [i]; if (value < 10) - Result.Write ("{0},", value); + CSResult.Write ("{0},", value); else - Result.Write ("0x{0:X02},", value); + CSResult.Write ("0x{0:X02},", value); + CResult.Write ("{0},", value); #if Binary binary.Write (value); #endif - if ((i & 0xF) == 0xF) - Result.WriteLine ("// {0:X04}", i - 0xF); + if ((i & 0xF) == 0xF) { + CSResult.WriteLine ("// {0:X04}", + UUtil.Level1.ToCodePoint (i - 0xF)); + CResult.WriteLine (); + } } - Result.WriteLine ("};"); - Result.WriteLine (); + CResult.WriteLine ("0};"); + CSResult.WriteLine ("};"); + CSResult.WriteLine (); // Secondary weight - Result.WriteLine ("internal static readonly byte [] level2 = new byte [] {"); + CResult.WriteLine ("static const guint8 collation_table_level2 [] = {"); + CSResult.WriteLine ("static readonly byte [] level2Arr = new byte [] {"); #if Binary binary.Write (level2.Length); #endif for (int i = 0; i < level2.Length; i++) { byte value = level2 [i]; if (value < 10) - Result.Write ("{0},", value); + CSResult.Write ("{0},", value); else - Result.Write ("0x{0:X02},", value); + CSResult.Write ("0x{0:X02},", value); + CResult.Write ("{0},", value); #if Binary binary.Write (value); #endif - if ((i & 0xF) == 0xF) - Result.WriteLine ("// {0:X04}", i - 0xF); + if ((i & 0xF) == 0xF) { + CSResult.WriteLine ("// {0:X04}", + UUtil.Level2.ToCodePoint (i - 0xF)); + CResult.WriteLine (); + } } - Result.WriteLine ("};"); - Result.WriteLine (); + CResult.WriteLine ("0};"); + CSResult.WriteLine ("};"); + CSResult.WriteLine (); // Thirtiary weight - Result.WriteLine ("internal static readonly byte [] level3 = new byte [] {"); + CResult.WriteLine ("static const guint8 collation_table_level3 [] = {"); + CSResult.WriteLine ("static readonly byte [] level3Arr = new byte [] {"); #if Binary binary.Write (level3.Length); #endif for (int i = 0; i < level3.Length; i++) { byte value = level3 [i]; if (value < 10) - Result.Write ("{0},", value); + CSResult.Write ("{0},", value); else - Result.Write ("0x{0:X02},", value); + CSResult.Write ("0x{0:X02},", value); + CResult.Write ("{0},", value); #if Binary binary.Write (value); #endif - if ((i & 0xF) == 0xF) - Result.WriteLine ("// {0:X04}", i - 0xF); + if ((i & 0xF) == 0xF) { + CSResult.WriteLine ("// {0:X04}", + UUtil.Level3.ToCodePoint (i - 0xF)); + CResult.WriteLine (); + } } - Result.WriteLine ("};"); - Result.WriteLine (); + CResult.WriteLine ("0};"); + CSResult.WriteLine ("};"); + CSResult.WriteLine (); +/* // Width insensitivity mappings // (for now it is more lightweight than dumping the // entire NFKD table). - Result.WriteLine ("internal static readonly ushort [] widthCompat = new ushort [] {"); + CResult.WriteLine ("static const guint16* widthCompat [] = {"); + CSResult.WriteLine ("static readonly ushort [] widthCompatArr = new ushort [] {"); #if Binary binary.Write (widthCompat.Length); #endif for (int i = 0; i < widthCompat.Length; i++) { ushort value = widthCompat [i]; if (value < 10) - Result.Write ("{0},", value); + CSResult.Write ("{0},", value); else - Result.Write ("0x{0:X02},", value); + CSResult.Write ("0x{0:X02},", value); + CResult.Write ("{0},", value); #if Binary binary.Write (value); #endif - if ((i & 0xF) == 0xF) - Result.WriteLine ("// {0:X04}", i - 0xF); + if ((i & 0xF) == 0xF) { + CSResult.WriteLine ("// {0:X04}", + UUtil.WidthCompat.ToCodePoint (i - 0xF)); + CResult.WriteLine (); + } } - Result.WriteLine ("};"); - Result.WriteLine (); + CResult.WriteLine ("0};"); + CSResult.WriteLine ("};"); + CSResult.WriteLine (); +*/ + #if Binary - using (FileStream fs = File.Create ("../collation.core.bin")) { + using (FileStream fs = File.Create ("../resources/collation.core.bin")) { byte [] array = ms.ToArray (); fs.Write (array, 0, array.Length); } @@ -468,32 +540,70 @@ sw.Close (); SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0); } - void SerializeCJK (string name, ushort [] cjk, int max) + void SerializeCJK (string name, ushort [] cjk, int max_unused) { - int offset = 0;//char.MaxValue - cjk.Length; - Result.WriteLine ("static ushort [] {0} = new ushort [] {{", name); +// CResult.WriteLine ("static const int collation_table_collation_cjk_{0}_size [] = {1};", name, cjk.Length); + CSResult.WriteLine ("const int {0}ArrLength = {1};", name, cjk.Length); + + int len = cjk.Length; + CResult.WriteLine ("static const guint8 collation_table_collation_cjk_{0} [] = {{", name); + CSResult.WriteLine ("static byte [] {0}Arr = new byte [] {{", name); + // the actual length is *2 + for (int i = 0; i < 4; i++, len /= 256) { + CResult.Write ("{0},", len & 0xFF); + CSResult.Write ("0x{0:X04},", len & 0xFF); + } + CResult.WriteLine (); + CSResult.WriteLine (); #if Binary MemoryStream ms = new MemoryStream (); BinaryWriter binary = new BinaryWriter (ms); + binary.Write (UUtil.ResourceVersion); + binary.Write (cjk.Length); // the actual size is *2. #endif + // category for (int i = 0; i < cjk.Length; i++) { - if (i + offset == max) - break; - ushort value = cjk [i]; +// if (i == max) +// break; + byte value = (byte) (cjk [i] >> 8); if (value < 10) - Result.Write ("{0},", value); + CSResult.Write ("{0},", value); else - Result.Write ("0x{0:X04},", value); + CSResult.Write ("0x{0:X02},", value); + CResult.Write ("{0},", value); #if Binary binary.Write (value); #endif - if ((i & 0xF) == 0xF) - Result.WriteLine ("// {0:X04}", i - 0xF + offset); + if ((i & 0xF) == 0xF) { + CSResult.WriteLine ("// {0:X04}", i - 0xF); + CResult.WriteLine (); + } } - Result.WriteLine ("};"); - Result.WriteLine (); + + // level 1 + for (int i = 0; i < cjk.Length; i++) { +// if (i == max) +// break; + byte value = (byte) (cjk [i] & 0xFF); + if (value < 10) + CSResult.Write ("{0},", value); + else + CSResult.Write ("0x{0:X02},", value); + CResult.Write ("{0},", value); +#if Binary + binary.Write (value); +#endif + if ((i & 0xF) == 0xF) { + CSResult.WriteLine ("// {0:X04}", i - 0xF); + CResult.WriteLine (); + } + } + + CResult.WriteLine ("0};"); + CSResult.WriteLine ("};"); + CSResult.WriteLine (); #if Binary - using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) { + using (FileStream fs = File.Create (String.Format ("../resources/collation.{0}.bin", name))) { byte [] array = ms.ToArray (); fs.Write (array, 0, array.Length); } @@ -502,30 +612,35 @@ sw.Close (); void SerializeCJK (string name, byte [] cjk, int max) { - int offset = 0;//char.MaxValue - cjk.Length; - Result.WriteLine ("static byte [] {0} = new byte [] {{", name); + CResult.WriteLine ("static const guint8 collation_table_collation_cjk_{0} [] = {{", name); + CSResult.WriteLine ("static byte [] {0}Arr = new byte [] {{", name); #if Binary MemoryStream ms = new MemoryStream (); BinaryWriter binary = new BinaryWriter (ms); + binary.Write (UUtil.ResourceVersion); #endif for (int i = 0; i < cjk.Length; i++) { - if (i + offset == max) + if (i == max) break; byte value = cjk [i]; if (value < 10) - Result.Write ("{0},", value); + CSResult.Write ("{0},", value); else - Result.Write ("0x{0:X02},", value); + CSResult.Write ("0x{0:X02},", value); + CResult.Write ("{0},", value); #if Binary binary.Write (value); #endif - if ((i & 0xF) == 0xF) - Result.WriteLine ("// {0:X04}", i - 0xF + offset); + if ((i & 0xF) == 0xF) { + CSResult.WriteLine ("// {0:X04}", i - 0xF); + CResult.WriteLine (); + } } - Result.WriteLine ("};"); - Result.WriteLine (); + CResult.WriteLine ("0};"); + CSResult.WriteLine ("};"); + CSResult.WriteLine (); #if Binary - using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) { + using (FileStream fs = File.Create (String.Format ("../resources/collation.{0}.bin", name))) { byte [] array = ms.ToArray (); fs.Write (array, 0, array.Length); } @@ -536,35 +651,46 @@ sw.Close (); { Hashtable indexes = new Hashtable (); Hashtable counts = new Hashtable (); - Result.WriteLine ("static char [] tailorings = new char [] {"); + CResult.WriteLine ("static const guint16 collation_table_tailoring [] = {"); + CSResult.WriteLine ("static char [] tailoringArr = new char [] {"); int count = 0; #if Binary MemoryStream ms = new MemoryStream (); BinaryWriter binary = new BinaryWriter (ms); + // Here we don't need to output resource version. + // This is cached. #endif foreach (Tailoring t in tailorings) { if (t.Alias != 0) continue; - Result.Write ("/*{0}*/", t.LCID); + CResult.Write ("/*{0}*/", t.LCID); + CSResult.Write ("/*{0}*/", t.LCID); indexes.Add (t.LCID, count); char [] values = t.ItemToCharArray (); counts.Add (t.LCID, values.Length); foreach (char c in values) { - Result.Write ("'\\x{0:X}', ", (int) c); - if (++count % 16 == 0) - Result.WriteLine (" // {0:X04}", count - 16); + CSResult.Write ("'\\x{0:X}', ", (int) c); + CResult.Write ("{0},", (int) c); + if (++count % 16 == 0) { + CSResult.WriteLine (" // {0:X04}", count - 16); + CResult.WriteLine (); + } #if Binary binary.Write ((ushort) c); #endif } } - Result.WriteLine ("};"); + CResult.WriteLine ("0};"); + CSResult.WriteLine ("};"); - Result.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {"); + CResult.WriteLine ("static const guint32 collation_table_tailoring_infos [] = {"); + CResult.WriteLine ("{0}, /*count*/", tailorings.Count); + CSResult.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {"); #if Binary byte [] rawdata = ms.ToArray (); ms = new MemoryStream (); binary = new BinaryWriter (ms); + binary.Write (UUtil.ResourceVersion); binary.Write (tailorings.Count); #endif foreach (Tailoring t in tailorings) { @@ -580,7 +706,8 @@ sw.Close (); foreach (Tailoring t2 in tailorings) if (t2.LCID == t.LCID) french = t2.FrenchSort; - Result.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false"); + CSResult.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false"); + CResult.WriteLine ("{0},{1},{2},{3},", t.LCID, idx, cnt, french ? 1 : 0); #if Binary binary.Write (t.LCID); binary.Write (idx); @@ -588,7 +715,8 @@ sw.Close (); binary.Write (french); #endif } - Result.WriteLine ("};"); + CResult.WriteLine ("0};"); + CSResult.WriteLine ("};"); #if Binary binary.Write ((byte) 0xFF); binary.Write ((byte) 0xFF); @@ -596,7 +724,7 @@ sw.Close (); binary.Write (rawdata, 0, rawdata.Length); - using (FileStream fs = File.Create ("../collation.tailoring.bin")) { + using (FileStream fs = File.Create ("../resources/collation.tailoring.bin")) { byte [] array = ms.ToArray (); fs.Write (array, 0, array.Length); } @@ -627,6 +755,7 @@ sw.Close (); ParseJISOrder (cp932); // in prior to ParseUnidata() ParseUnidata (unidata); + ModifyUnidata (); ParseDerivedCoreProperties (derivedCoreProps); ParseScripts (scripts); ParseCJK (chXML, jaXML, koXML); @@ -657,14 +786,17 @@ sw.Close (); { StringBuilder sb = new StringBuilder (); for (int i = 0; i < s.Length; i++) { - if (s.StartsWith ("\\u")) { - sb.Append ((char) int.Parse ( - s.Substring (2, 4), NumberStyles.HexNumber), + if (i + 5 < s.Length && + s [i] == '\\' && s [i + 1] == 'u') { + sb.Append ( + (char) int.Parse ( + s.Substring (i + 2, 4), + NumberStyles.HexNumber), 1); i += 5; } - else - sb.Append (s [i]); + else + sb.Append (s [i]); } return sb.ToString (); } @@ -844,10 +976,10 @@ sw.Close (); target = 'B'; else if (s.Substring (offset).StartsWith ("OPEN O")) target = 'C'; + else if (s.Substring (offset).StartsWith ("ETH")) + target = 'D'; else if (s.Substring (offset).StartsWith ("SCHWA")) target = 'E'; - else if (s.Substring (offset).StartsWith ("ENG")) - target = 'N'; else if (s.Substring (offset).StartsWith ("OI;")) // 01A2,01A3 target = 'O'; else if (s.Substring (offset).StartsWith ("YR;")) // 01A2,01A3 @@ -856,6 +988,26 @@ sw.Close (); target = 'S'; else if (s.Substring (offset).StartsWith ("ESH")) target = 'S'; + else if (s.Substring (offset).StartsWith ("OUNCE")) + target = 'Z'; + + // For remaining IPA chars, direct mapping is + // much faster. + switch (cp) { + case 0x0166: case 0x0167: + // Though they are 'T', they have different weight + target = char.MinValue; break; + case 0x0299: target = 'B'; break; + case 0x029A: target = 'E'; break; + case 0x029B: target = 'G'; break; + case 0x029C: target = 'H'; break; + case 0x029D: target = 'J'; break; + case 0x029E: target = 'K'; break; + case 0x029F: target = 'L'; break; + case 0x02A0: target = 'Q'; break; + case 0x02A7: target = 'T'; break; + case 0x02A8: target = 'T'; break; + } if (target == char.MinValue) target = previousLatinTarget; @@ -905,7 +1057,19 @@ sw.Close (); "SOUTH WEST", "LEFTWARDS", "NORTH WEST", + "LEFT RIGHT", + "UP DOWN", }; + if (s.IndexOf ("RIGHTWARDS") >= 0 && + s.IndexOf ("LEFTWARDS") >= 0) + value = 0xE1 - 0xD8; + else if (s.IndexOf ("UPWARDS") >= 0 && + s.IndexOf ("DOWNWARDS") >= 0) + value = 0xE2 - 0xD8; + else if (s.IndexOf ("ARROW") >= 0 && + s.IndexOf ("COMBINING") < 0 && + s.IndexOf ("CLOCKWISE") >= 0) + value = s.IndexOf ("ANTICLOCKWISE") >= 0 ? 0xE4 - 0xD8 : 0xE3 - 0xD8; if (value == 0) for (int i = 1; value == 0 && i < arrowTargets.Length; i++) if (s.IndexOf (arrowTargets [i]) > 0 && @@ -920,7 +1084,7 @@ sw.Close (); // Box names if (0x2500 <= cp && cp < 0x2600) { - int value = 0; + int value = int.MinValue; // flags: // up:1 down:2 right:4 left:8 vert:16 horiz:32 // [h,rl] [r] [l] @@ -960,7 +1124,8 @@ sw.Close (); flag |= 32; int fidx = flags.IndexOf (flag); - value = fidx < 0 ? fidx : offsets [fidx]; + if (fidx >= 0) + value = offsets [fidx]; } else if (s.IndexOf ("BLOCK") >= 0) { if (s.IndexOf ("ONE EIGHTH") >= 0) value = 0x12; @@ -1021,6 +1186,8 @@ sw.Close (); else value = 0xC9 - 0xE5; } + else if (s.IndexOf ("BULLET") >= 0) + value = 0xCC - 0xE5; if (0x25DA <= cp && cp <= 0x25E5) value = 0xCD + cp - 0x25DA - 0xE5; @@ -1030,7 +1197,7 @@ sw.Close (); case 0x2572: value = 0x10; break; case 0x2573: value = 0x11; break; } - if (value != 0) + if (value != int.MinValue) boxValues.Add (new DictionaryEntry ( cp, value)); } @@ -1045,15 +1212,23 @@ sw.Close (); sortableCharNames.Add (new DictionaryEntry ( cp, name.Substring (7))); + if (Char.GetUnicodeCategory ((char) cp) == + UnicodeCategory.MathSymbol) { + if (name.StartsWith ("CIRCLED ")) + diacritical [cp] = 0xEE; + if (name.StartsWith ("SQUARED ")) + diacritical [cp] = 0xEF; + } + // diacritical weights by character name if (diacritics.Length != diacriticWeights.Length) throw new Exception (String.Format ("Should not happen. weights are {0} while labels are {1}", diacriticWeights.Length, diacritics.Length)); - for (int d = 0; d < diacritics.Length; d++) { + for (int d = diacritics.Length - 1; d >= 0; d--) { if (s.IndexOf (diacritics [d]) > 0) { diacritical [cp] += diacriticWeights [d]; if (s.IndexOf ("COMBINING") >= 0) diacritical [cp] -= (byte) 2; - continue; + break; } // also process "COMBINING blah" here // For now it is limited to cp < 0x0370 @@ -1063,8 +1238,10 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la if (tmp.IndexOf ("WITH ") == 0) tmp = tmp.Substring (4); tmp = String.Concat ("COMBINING", (tmp [0] != ' ' ? " " : ""), tmp); - if (name == tmp) + if (name == tmp) { diacritical [cp] = (byte) (diacriticWeights [d] - 2); + break; + } //if (name == tmp) //Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'", name, tmp, cp); } @@ -1072,26 +1249,9 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la if (s.IndexOf ("FULL STOP") > 0 && (s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0)) diacritical [cp] |= 0xF4; - - // Cyrillic letter name - if (0x0430 <= cp && cp <= 0x0486 && - Char.IsLetter ((char) cp)) { - byte value = (byte) (cyrillicNameMap.Count * 3 + 0x06); - // Get primary letter name i.e. - // XXX part of CYRILLIC LETTER XXX yyy - // e.g. "IZHITSA" for "IZHITSA DOUBLE GRAVE". - string letterName = - name.Substring (name.IndexOf ("LETTER ") + 7); - int tmpIdx = letterName.IndexOf (' '); - letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx); -//Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName); - if (cyrillicNameMap.ContainsKey (letterName)) - value = (byte) cyrillicLetterPrimaryValues [cyrillicNameMap [letterName]]; - else - cyrillicNameMap [letterName] = cp; - - cyrillicLetterPrimaryValues [cp] = value; - } + if (s.StartsWith ("SCRIPT") || s.IndexOf (" SCRIPT ") > 0) + diacritical [cp] = (byte) (s.IndexOf ("SMALL") > 0 ? 3 : + s.IndexOf ("CAPITAL") > 0 ? 5 : 4); // Arabic letter name if (0x0621 <= cp && cp <= 0x064A && @@ -1308,7 +1468,6 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la void ParseScripts (string filename) { - ArrayList cyrillic = new ArrayList (); ArrayList gurmukhi = new ArrayList (); ArrayList gujarati = new ArrayList (); ArrayList georgian = new ArrayList (); @@ -1338,11 +1497,6 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la continue; switch (value) { - case "Cyrillic": - for (int x = cp; x <= cpEnd; x++) - if (!IsIgnorable (x)) - cyrillic.Add ((char) x); - break; case "Gurmukhi": for (int x = cp; x <= cpEnd; x++) if (!IsIgnorable (x)) @@ -1366,12 +1520,10 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la } } } - cyrillic.Sort (UCAComparer.Instance); gurmukhi.Sort (UCAComparer.Instance); gujarati.Sort (UCAComparer.Instance); georgian.Sort (UCAComparer.Instance); thaana.Sort (UCAComparer.Instance); - orderedCyrillic = (char []) cyrillic.ToArray (typeof (char)); orderedGurmukhi = (char []) gurmukhi.ToArray (typeof (char)); orderedGujarati = (char []) gujarati.ToArray (typeof (char)); orderedGeorgian = (char []) georgian.ToArray (typeof (char)); @@ -1458,16 +1610,55 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la category = "ja"; arr = cjkJA; offset = 0;//char.MaxValue - arr.Length; - doc.Load (jaXML); - s = doc.SelectSingleNode ("/ldml/collations/collation/rules/pc").InnerText; + + // SPECIAL CASES + arr [0x4EDD] = 0x8002; // Chinese repetition mark? + arr [0x337B] = 0x8004; // Those 4 characters are Gengou + arr [0x337E] = 0x8005; + arr [0x337D] = 0x8006; + arr [0x337C] = 0x8007; + v = 0x8008; - foreach (char c in s) { + foreach (JISCharacter jc in jisJapanese) { + if (jc.JIS < 0x8800) + continue; + char c = (char) jc.CP; + if (c < '\u4E00') - Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v); + // Console.Error.WriteLine ("---- warning: for {0} {1:X04} is omitted which should be {2:X04}", category, (int) c, v); + continue; else { arr [(int) c - offset] = (ushort) v++; if (v % 256 == 0) v += 2; + + // SPECIAL CASES: + if (c == '\u662D') // U+337C + continue; + if (c == '\u5927') // U+337D + continue; + if (c == '\u5E73') // U+337B + continue; + if (c == '\u660E') // U+337E + continue; + if (c == '\u9686') // U+F9DC + continue; + + // FIXME: there are still remaining + // characters after U+FA0C. +// for (int k = 0; k < char.MaxValue; k++) { + for (int k = 0; k < '\uFA0D'; k++) { + if (decompIndex [k] == 0 || IsIgnorable (k)) + continue; + if (decompValues [decompIndex [k]] == c /*&& + decompLength [k] == 1*/ || + decompLength [k] == 3 && + decompValues [decompIndex [k] + 1] == c) { + arr [k - offset] = (ushort) v++; + if (v % 256 == 0) + v += 2; + } + } } } @@ -1523,8 +1714,124 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la } } + void ModifyUnidata () + { + ArrayList decompValues = new ArrayList (this.decompValues); + + // Hebrew uppercase letters. + foreach (int i in new int [] + {0x05DB, 0x05DE, 0x05E0, 0x05E4, 0x05E6}) + isUppercase [i] = true; + + + // Modify some decomposition equivalence + for (int i = 0xFE31; i <= 0xFE34; i++) { + decompType [i] = 0; + decompIndex [i] = 0; + decompLength [i] = 0; + } + decompType [0x037E] = 0; + decompIndex [0x037E] = 0; + decompLength [0x037E] = 0; + + // Hangzhou numbers + for (int i = 0x3021; i <= 0x3029; i++) + diacritical [i] = 0x4E; + // Korean parens numbers + for (int i = 0x3200; i <= 0x321C; i++) + diacritical [i] = 0xA; + for (int i = 0x3260; i <= 0x327B; i++) + diacritical [i] = 0xC; + + // LAMESPEC: these remapping should not be done. + // Windows have incorrect CJK compat mappings. + decompValues [decompIndex [0x32A9]] = 0x91AB; + decompLength [0x323B] = 1; + decompValues [decompIndex [0x323B]] = 0x5B78; + decompValues [decompIndex [0x32AB]] = 0x5B78; + decompValues [decompIndex [0x32A2]] = 0x5BEB; + decompLength [0x3238] = 1; + decompValues [decompIndex [0x3238]] = 0x52DE; + decompValues [decompIndex [0x3298]] = 0x52DE; + + // LAMESPEC: custom remapping (which is not bugs but not fine, non-standard compliant things) + decompIndex [0xFA0C] = decompValues.Count; + decompValues.Add ((int) 0x5140); + decompLength [0xFA0C] = 1; + decompIndex [0xF929] = decompLength [0xF929] = 0; + + decompValues [decompIndex [0xF92C]] = 0x90DE; + + decompIndex [0x2125] = decompValues.Count; + decompValues.Add ((int) 0x005A); + decompLength [0x2125] = 1; + decompType [0x2125] = DecompositionFont; + + this.decompValues = decompValues.ToArray (typeof (int)) as int []; + } + void ModifyParsedValues () { + // Sometimes STROKE don't work fine + diacritical [0xD8] = diacritical [0xF8] = 0x21; + diacritical [0x141] = diacritical [0x142] = 0x1F; + // FIXME: why? + diacritical [0xAA] = diacritical [0xBA] = 3; + diacritical [0xD0] = diacritical [0xF0] = 0x68; + diacritical [0x131] = 3; + diacritical [0x138] = 3; + // TOPBAR does not work as an identifier for the weight + diacritical [0x182] = diacritical [0x183] = 0x68; // B + diacritical [0x18B] = diacritical [0x18C] = 0x1E; // D + // TONE TWO + diacritical [0x1A7] = diacritical [0x1A8] = 0x87; + // TONE SIX + diacritical [0x184] = diacritical [0x185] = 0x87; + // OPEN E + diacritical [0x190] = diacritical [0x25B] = 0x7B; + // There are many letters w/ diacritical weight 0x7B + diacritical [0x0192] = diacritical [0x0194] = + diacritical [0x0195] = diacritical [0x0196] = + diacritical [0x019C] = diacritical [0x019E] = + diacritical [0x01A6] = diacritical [0x01B1] = + diacritical [0x01B2] = diacritical [0x01BF] = 0x7B; + // ... as well as 0x7C + diacritical [0x01A2] = diacritical [0x01A3] = 0x7C; + + // NFKD characters seem to have diacritical + // weight as 3,4,5... but the order does not look + // by codepoint and I have no idea how they are sorted. + diacritical [0x210E] = 3; + diacritical [0x210F] = 0x68; + diacritical [0x2110] = 4; + diacritical [0x2111] = 5; + diacritical [0x2112] = 4; + diacritical [0x2113] = 4; + diacritical [0x211B] = 4; + diacritical [0x211C] = 5; + + // some cyrillic diacritical weight. They seem to be + // based on old character names, so it's quicker to + // set them directly here. + // FIXME: they are by mostly unknown reason + diacritical [0x0496] = diacritical [0x0497] = 7; + diacritical [0x0498] = diacritical [0x0499] = 0x1A; + diacritical [0x049A] = diacritical [0x049B] = 0x17; + diacritical [0x049C] = diacritical [0x049D] = 9; + diacritical [0x049E] = diacritical [0x049F] = 4; + diacritical [0x04A0] = diacritical [0x04A1] = 0xA; + diacritical [0x04A2] = diacritical [0x04A3] = 7; + diacritical [0x04A4] = diacritical [0x04A5] = 8; + diacritical [0x04AA] = diacritical [0x04AB] = 0x1A; // ES CEDILLA? + diacritical [0x04AC] = diacritical [0x04AD] = 7; // RIGHT DESCENDER? but U+4B2 + diacritical [0x04AE] = diacritical [0x04AF] = 0xB; // STRAIGHT U? + diacritical [0x04B2] = diacritical [0x04B3] = 0x17; // RIGHT DESCENDER? but U+4AC + diacritical [0x04B4] = diacritical [0x04B5] = 3; + diacritical [0x04B6] = 8; + diacritical [0x04B7] = 7; + diacritical [0x04B8] = diacritical [0x04B9] = 9; + diacritical [0x04BA] = diacritical [0x04BB] = 9; + // number, secondary weights byte weight = 0x38; int [] numarr = numberSecondaryWeightBounds; @@ -1533,19 +1840,12 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la if (Char.IsNumber ((char) cp)) diacritical [cp] = weight; - // Modify some decomposition equivalence - decompType [0xFE31] = 0; - decompIndex [0xFE31] = 0; - decompLength [0xFE31] = 0; - decompType [0xFE32] = 0; - decompIndex [0xFE32] = 0; - decompLength [0xFE32] = 0; - - // Korean parens numbers - for (int i = 0x3200; i <= 0x321C; i++) - diacritical [i] = 0xA; - for (int i = 0x3260; i <= 0x327B; i++) - diacritical [i] = 0xC; + // Gurmukhi special letters' diacritical weight + for (int i = 0x0A50; i < 0x0A60; i++) + diacritical [i] = 4; + // Oriya special letters' diacritical weight + for (int i = 0x0B5C; i < 0x0B60; i++) + diacritical [i] = 6; // Update name part of named characters for (int i = 0; i < sortableCharNames.Count; i++) { @@ -1587,14 +1887,25 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la #region Specially ignored // 01 // This will raise "Defined" flag up. + // FIXME: Check If it is really fine. Actually for + // Japanese voice marks this code does remapping. foreach (char c in specialIgnore) map [(int) c] = new CharMapEntry (0, 0, 0); #endregion + #region Extenders (FF FF) + fillIndex [0xFF] = 0xFF; + char [] specialBiggest = new char [] { + '\u3005', '\u3031', '\u3032', '\u309D', + '\u309E', '\u30FC', '\u30FD', '\u30FE', + '\uFE7C', '\uFE7D', '\uFF70'}; + foreach (char c in specialBiggest) + AddCharMap (c, 0xFF, 0); + #endregion #region Variable weights // Controls : 06 03 - 06 3D - fillIndex [6] = 3; + fillIndex [0x6] = 3; for (int i = 0; i < 65536; i++) { if (IsIgnorable (i)) continue; @@ -1607,10 +1918,15 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la } // Apostrophe 06 80 - fillIndex [6] = 0x80; - AddCharMapGroup ('\'', 6, 1, 0); + fillIndex [0x6] = 0x80; + AddCharMap ('\'', 6, 0); + AddCharMap ('\uFF07', 6, 1); AddCharMap ('\uFE63', 6, 1); + // SPECIAL CASE: fill FE32 here in prior to be added + // at 2013. Windows does not always respect NFKD. + map [0xFE32] = new CharMapEntry (6, 0x90, 0); + // Hyphen/Dash : 06 81 - 06 90 for (int i = 0; i < char.MaxValue; i++) { if (!IsIgnorable (i) && @@ -1627,12 +1943,16 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la } } } + // They are regarded as primarily equivalent to '-' + map [0x208B] = new CharMapEntry (6, 0x82, 0); + map [0x207B] = new CharMapEntry (6, 0x82, 0); + map [0xFF0D] = new CharMapEntry (6, 0x82, 0); // Arabic variable weight chars 06 A0 - fillIndex [6] = 0xA0; // vowels for (int i = 0x64B; i <= 0x650; i++) - AddArabicCharMap ((char) i); + AddArabicCharMap ((char) i, 6, 1, 0); // sukun AddCharMapGroup ('\u0652', 6, 1, 0); // shadda @@ -1652,10 +1972,11 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la for (int i = 0x0329; i <= 0x0334; i++) if (!IsIgnorable (i)) AddCharMap ((char) i, 0x1, 1); + fillIndex [0x1]++; for (int i = 0x0339; i <= 0x0341; i++) if (!IsIgnorable (i)) AddCharMap ((char) i, 0x1, 1); - fillIndex [0x1] = 0x72; + fillIndex [0x1] = 0x74; for (int i = 0x0346; i <= 0x0348; i++) if (!IsIgnorable (i)) AddCharMap ((char) i, 0x1, 1); @@ -1668,6 +1989,7 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la for (int i = 0x02CE; i <= 0x02CF; i++) if (!IsIgnorable (i)) AddCharMap ((char) i, 0x1, 1); + fillIndex [0x1]++; for (int i = 0x02D1; i <= 0x02D3; i++) if (!IsIgnorable (i)) AddCharMap ((char) i, 0x1, 1); @@ -1676,30 +1998,87 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la if (!IsIgnorable (i)) AddCharMap ((char) i, 0x1, 1); + // FIXME: needs more love here (it should eliminate // all the hacky code above). for (int i = 0x0300; i < 0x0370; i++) if (!IsIgnorable (i) && diacritical [i] != 0 - /* especiall here*/ && !map [i].Defined) + && !map [i].Defined) map [i] = new CharMapEntry ( 0x1, 0x1, diacritical [i]); - fillIndex [0x1] = 0xAC; - for (int i = 0x07A6; i <= 0x07B0; i++) - if (!IsIgnorable (i)) - AddCharMap ((char) i, 0x1, 1); + // Cyrillic and Armenian nonspacing mark + fillIndex [0x1] = 0x94; + for (int i = 0x400; i < 0x580; i++) + if (!IsIgnorable (i) && + Char.GetUnicodeCategory ((char) i) == + UnicodeCategory.NonSpacingMark) + AddCharMap ((char) i, 1, 1); - fillIndex [0x1] = 0x0C; - for (int i = 0x0EC8; i <= 0x0ECD; i++) - if (!IsIgnorable (i)) + fillIndex [0x1] = 0x8D; + // syriac dotted nonspacing marks (1) + AddCharMap ('\u0740', 0x1, 1); + AddCharMap ('\u0741', 0x1, 1); + AddCharMap ('\u0742', 0x1, 1); + // syriac oblique nonspacing marks + AddCharMap ('\u0747', 0x1, 1); + AddCharMap ('\u0748', 0x1, 1); + // syriac dotted nonspacing marks (2) + fillIndex [0x1] = 0x94; // this reset is mandatory + AddCharMap ('\u0732', 0x1, 1); + AddCharMap ('\u0735', 0x1, 1); + AddCharMap ('\u0738', 0x1, 1); + AddCharMap ('\u0739', 0x1, 1); + AddCharMap ('\u073C', 0x1, 1); + // SPECIAL CASES: superscripts + AddCharMap ('\u073F', 0x1, 1); + AddCharMap ('\u0711', 0x1, 1); + // syriac "DOTS" + for (int i = 0x0743; i <= 0x0746; i++) + AddCharMap ((char) i, 0x1, 1); + for (int i = 0x0730; i <= 0x0780; i++) + if (!map [i].Defined && + Char.GetUnicodeCategory ((char) i) == + UnicodeCategory.NonSpacingMark) AddCharMap ((char) i, 0x1, 1); // LAMESPEC: It should not stop at '\u20E1'. There are // a few more characters (that however results in // overflow of level 2 unless we start before 0xDD). - fillIndex [0x1] = 0xDC; - for (int i = 0x20d0; i <= 0x20e1; i++) + fillIndex [0x1] = 0xDD; + for (int i = 0x20D0; i <= 0x20DC; i++) + AddCharMap ((char) i, 0x1, 1); + fillIndex [0x1] = 0xEC; + for (int i = 0x20DD; i <= 0x20E1; i++) + AddCharMap ((char) i, 0x1, 1); + fillIndex [0x1] = 0x4; + AddCharMap ('\u0CD5', 0x1, 1); + AddCharMap ('\u0CD6', 0x1, 1); + AddCharMap ('\u093C', 0x1, 1); + for (int i = 0x302A; i <= 0x302D; i++) + AddCharMap ((char) i, 0x1, 1); + AddCharMap ('\u0C55', 0x1, 1); + AddCharMap ('\u0C56', 0x1, 1); + + fillIndex [0x1] = 0x50; // I wonder how they are sorted + for (int i = 0x02D4; i <= 0x02D7; i++) AddCharMap ((char) i, 0x1, 1); + + // They are not part of Nonspacing marks, but have + // only diacritical weight. + for (int i = 0x3099; i <= 0x309C; i++) + map [i] = new CharMapEntry (1, 1, 1); + map [0xFF9E] = new CharMapEntry (1, 1, 1); + map [0xFF9F] = new CharMapEntry (1, 1, 2); + map [0x309D] = new CharMapEntry (0xFF, 0xFF, 1); + map [0x309E] = new CharMapEntry (0xFF, 0xFF, 1); + for (int i = 0x30FC; i <= 0x30FE; i++) + map [i] = new CharMapEntry (0xFF, 0xFF, 1); + + fillIndex [0x1] = 0xA; + for (int i = 0x0951; i <= 0x0954; i++) + AddCharMap ((char) i, 0x1, 2); + #endregion @@ -1722,6 +2101,7 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la // while they aren't. AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol AddCharMap ('\u2423', 0x7, 1, 0); // open box + #endregion // category 09 - continued symbols from 08 @@ -1731,7 +2111,7 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la AddCharMap ((char) cp, 0x9, 1, 0); // arrows - byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3}; + byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}; foreach (DictionaryEntry de in arrowValues) { int idx = (int) de.Value; int cp = (int) de.Key; @@ -1743,6 +2123,8 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la } // boxes byte [] boxLv2 = new byte [128]; + // 0-63 will be used for those offsets are positive, + // and 64-127 are for negative ones. for (int i = 0; i < boxLv2.Length; i++) boxLv2 [i] = 3; foreach (DictionaryEntry de in boxValues) { @@ -1752,7 +2134,7 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la continue; if (off < 0) { fillIndex [0x9] = (byte) (0xE5 + off); - AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [-off]++); + AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [128 + off]++); } else { fillIndex [0x9] = (byte) (0xE5 + off); @@ -1773,8 +2155,7 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la uc = Char.GetUnicodeCategory ((char) cp); if (!IsIgnorable (cp) && uc == UnicodeCategory.CurrencySymbol && - cp != '$' || - cp == 0xAC) + cp != '$') AddCharMapGroup ((char) cp, 0xA, 1, 0); } // byte other symbols @@ -1783,14 +2164,33 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la continue; // SPECIAL: skip FIXME: why? uc = Char.GetUnicodeCategory ((char) cp); if (!IsIgnorable (cp) && - uc == UnicodeCategory.OtherSymbol) + uc == UnicodeCategory.OtherSymbol || + cp == '\u00AC' || cp == '\u00B5' || cp == '\u00B7') AddCharMapGroup ((char) cp, 0xA, 1, 0); } + // U+30FB here + AddCharMapGroup ('\u30FB', 0xA, 1, 0); + + for (int cp = 0x2020; cp <= 0x2031; cp++) + if (Char.IsPunctuation ((char) cp)) + AddCharMap ((char) cp, 0xA, 1, 0); + // SPECIAL CASES: why? + AddCharMap ('\u203B', 0xA, 1, 0); + AddCharMap ('\u2040', 0xA, 1, 0); + AddCharMap ('\u2041', 0xA, 1, 0); + AddCharMap ('\u2042', 0xA, 1, 0); - fillIndex [0xA] = 0x1C; // FIXME: it won't be needed for (int cp = 0x20A0; cp <= 0x20AB; cp++) AddCharMap ((char) cp, 0xA, 1, 0); - fillIndex [0xA] = 0x2F; // FIXME: it won't be needed + + // 3004 is skipped at first... + for (int cp = 0x3010; cp <= 0x3040; cp++) + if (Char.IsSymbol ((char) cp)) + AddCharMap ((char) cp, 0xA, 1, 0); + // SPECIAL CASES: added here + AddCharMap ('\u3004', 0xA, 1, 0); + AddCharMap ('\u327F', 0xA, 1, 0); + for (int cp = 0x2600; cp <= 0x2613; cp++) AddCharMap ((char) cp, 0xA, 1, 0); // Dingbats @@ -1801,13 +2201,17 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la for (int i = 0x2440; i < 0x2460; i++) AddCharMap ((char) i, 0xA, 1, 0); + // SPECIAL CASES: why? + AddCharMap ('\u0E3F', 0xA, 1, 0); + AddCharMap ('\u2117', 0xA, 1, 0); + AddCharMap ('\u20AC', 0xA, 1, 0); #endregion #region Numbers // 0C 02 - 0C E1 fillIndex [0xC] = 2; // 9F8 : Bengali "one less than the denominator" - AddCharMap ('\u09F8', 0xC, 1); + AddCharMap ('\u09F8', 0xC, 1, 0x3C); ArrayList numbers = new ArrayList (); for (int i = 0; i < 65536; i++) @@ -1819,11 +2223,15 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la ArrayList numberValues = new ArrayList (); foreach (int i in numbers) numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i])); + // SPECIAL CASE: Cyrillic Thousand sign + numberValues.Add (new DictionaryEntry (0x0482, 1000m)); numberValues.Sort (DecimalDictionaryValueComparer.Instance); //foreach (DictionaryEntry de in numberValues) //Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]); + // FIXME: fillIndex adjustment lines are too + // complicated. It must be simpler. decimal prevValue = -1; foreach (DictionaryEntry de in numberValues) { int cp = (int) de.Key; @@ -1841,18 +2249,25 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la fillIndex [0xC]++; int xcp; - if (currValue <= 10) { - xcp = (int) prevValue + 0x2170 - 1; - AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); + if (currValue <= 13) { + if (currValue == 4) + fillIndex [0xC]++; + // SPECIAL CASE + if (currValue == 11) + AddCharMap ('\u0BF0', 0xC, 1); xcp = (int) prevValue + 0x2160 - 1; AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); - fillIndex [0xC] += 2; - xcp = (int) prevValue + 0x3021 - 1; + xcp = (int) prevValue + 0x2170 - 1; AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); fillIndex [0xC]++; } - else if (currValue == 11) + if (currValue < 12) + fillIndex [0xC]++; + if (currValue <= 10) { + xcp = (int) prevValue + 0x3021 - 1; + AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); fillIndex [0xC]++; + } } if (prevValue < currValue) prevValue = currValue; @@ -1860,20 +2275,19 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la continue; // HangZhou and Roman are add later // (code is above) - else if (0x3021 <= cp && cp < 0x302A - || 0x2160 <= cp && cp < 0x216A - || 0x2170 <= cp && cp < 0x217A) + if (0x3021 <= cp && cp < 0x302A + || 0x2160 <= cp && cp < 0x216C + || 0x2170 <= cp && cp < 0x217C) continue; - if (cp == 0x215B) // FIXME: why? + if (cp == 0x215B) // FIXME: why? fillIndex [0xC] += 2; else if (cp == 0x3021) // FIXME: why? fillIndex [0xC]++; - AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp]); if (addnew || cp <= '9') { int mod = (int) currValue - 1; int xcp; - if (1 <= currValue && currValue <= 10) { + if (1 <= currValue && currValue <= 11) { xcp = mod + 0x2776; AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); xcp = mod + 0x2780; @@ -1890,9 +2304,27 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]); } } + if (addnew && currValue >= 10 && currValue < 13 || cp == 0x09F9) + fillIndex [0xC]++; + AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp], true); - if (cp != 0x09E7 && cp != 0x09EA) + switch (cp) { + // Maybe Bengali digit numbers do not increase + // indexes, but 0x09E6 does. + case 0x09E7: case 0x09E8: case 0x09E9: + case 0x09EA: + // SPECIAL CASES + case 0x0BF0: case 0x2180: case 0x2181: + break; + // SPECIAL CASE + case 0x0BF1: fillIndex [0xC]++; + break; + default: + if (currValue < 11 || currValue == 1000) + fillIndex [0xC]++; + break; + } // Add special cases that are not regarded as // numbers in UnicodeCategory speak. @@ -1901,7 +2333,7 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la AddCharMapGroup ('\u01BD', 0xC, 0, 0); AddCharMapGroup ('\u01BC', 0xC, 1, 0); } - else if (cp == '6') // FIXME: why? + else if (cp == '2' || cp == '6') // FIXME: why? fillIndex [0xC]++; } @@ -1916,7 +2348,6 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la for (int i = 0; i < alphabets.Length; i++) AddAlphaMap (alphabets [i], 0xE, alphaWeights [i]); - // non-ASCII Latin alphabets // FIXME: there is no such characters that are placed // *after* "alphabets" array items. This is nothing @@ -1968,77 +2399,176 @@ throw new Exception (String.Format ("Should not happen. weights are {0} while la AddCharMapGroup ((char) i, 0xE, 1, 0); } - // Greek and Coptic - fillIndex [0xF] = 02; - for (int i = 0x0380; i < 0x0390; i++) + // IPA extensions + // FIXME: this results in not equivalent values to + // Windows, but is safer for comparison. + char [] ipaArray = new char [0x300 - 0x250 + 0x20]; + for (int i = 0x40; i < 0x60; i++) if (Char.IsLetter ((char) i)) - AddLetterMap ((char) i, 0xF, 1); - fillIndex [0xF] = 02; - for (int i = 0x0391; i < 0x03CF; i++) + ipaArray [i - 0x40] = (char) (i); + for (int i = 0x250; i < 0x300; i++) if (Char.IsLetter ((char) i)) - AddLetterMap ((char) i, 0xF, 1); + ipaArray [i - 0x250 + 0x20] = (char) i; + Array.Sort (ipaArray, UCAComparer.Instance); + int targetASCII = 0; + byte latinDiacritical = 0x7B; + foreach (char c in ipaArray) { + if (c <= 'Z') { + targetASCII = c; + latinDiacritical = 0x7B; + } + else + map [(int) c] = new CharMapEntry ( + 0xE, + map [targetASCII].Level1, + latinDiacritical++); + } + + // Greek and Coptic + + // FIXME: this is (mysterious and) incomplete. + for (int i = 0x0380; i < 0x0400; i++) + if (diacritical [i] == 0 && + decompLength [i] == 1 && + decompType [i] == DecompositionCompat) + diacritical [i] = 3; + + fillIndex [0xF] = 2; + for (int i = 0x0391; i < 0x03AA; i++) + if (i != 0x03A2) + AddCharMap ((char) i, 0xF, 1, + diacritical [i]); + fillIndex [0xF] = 2; + for (int i = 0x03B1; i < 0x03CA; i++) + if (i != 0x03C2) + AddCharMap ((char) i, 0xF, 1, + diacritical [i]); + // Final Sigma + map [0x03C2] = new CharMapEntry (0xF, + map [0x03C3].Level1, map [0x03C3].Level2); + fillIndex [0xF] = 0x40; - for (int i = 0x03D0; i < 0x0400; i++) - if (Char.IsLetter ((char) i)) - AddLetterMap ((char) i, 0xF, 1); + for (int i = 0x03DA; i < 0x03F0; i++) + AddCharMap ((char) i, 0xF, + (byte) (i % 2 == 0 ? 0 : 2), + diacritical [i]); + + // NFKD + for (int i = 0x0386; i <= 0x0400; i++) + FillLetterNFKD (i, true, true); + + // Cyrillic. + // Cyrillic letters are sorted like Latin letters i.e. + // containing culture-specific letters between the + // standard Cyrillic sequence. + // + // We can't use UCA here; it has different sorting. + char [] orderedCyrillic = new char [] { + '\u0430', '\u0431', '\u0432', '\u0433', '\u0434', + '\u0452', // DJE for Serbocroatian + '\u0435', + '\u0454', // IE for Ukrainian + '\u0436', '\u0437', + '\u0455', // DZE + '\u0438', + '\u0456', // Byelorussian-Ukrainian I + '\u0457', // YI + '\u0439', + '\u0458', // JE + '\u043A', '\u043B', + '\u0459', // LJE + '\u043C', '\u043D', + '\u045A', // NJE + '\u043E', + // 4E9 goes here. + '\u043F', '\u0440', '\u0441', '\u0442', + '\u045B', // TSHE for Serbocroatian + '\u0443', + '\u045E', // Short U for Byelorussian + '\u04B1', // Straight U w/ stroke (diacritical!) + '\u0444', '\u0445', '\u0446', '\u0447', + '\u045F', // DZHE + '\u0448', '\u0449', '\u044A', '\u044B', '\u044C', + '\u044D', '\u044E', '\u044F'}; + + // For some characters here is a map to basic cyrillic + // letters. See UnicodeData.txt character names for + // the sources. Here I simply declare an equiv. array. + // The content characters are map from U+490(,491), + // skipping small letters. + char [] cymap_src = new char [] { + '\u0433', '\u0433', '\u0433', '\u0436', + '\u0437', '\u043A', '\u043A', '\u043A', + '\u043A', '\u043D', '\u043D', '\u043F', + '\u0445', '\u0441', '\u0442', '\u0443', + '\u0443', '\u0445', '\u0446', '\u0447', + '\u0447', '\u0432', '\u0435', '\u0435', + '\u0406', '\u0436', '\u043A', '\u043D', + '\u0447', '\u0435'}; + + fillIndex [0x10] = 0x8D; + for (int i = 0x0460; i < 0x0481; i++) { + if (Char.IsLetter ((char) i)) { + if (i == 0x0476) + // U+476/477 have the same + // primary weight as U+474/475. + fillIndex [0x10] -= 3; + AddLetterMap ((char) i, 0x10, 3); + } + } - // Cyrillic - character name order fillIndex [0x10] = 0x6; -//* -for (int i = 0; i < orderedCyrillic.Length; i++) -Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]); - - // table which is moslty from UCA DUCET. for (int i = 0; i < orderedCyrillic.Length; i++) { char c = Char.ToUpper (orderedCyrillic [i], CultureInfo.InvariantCulture); if (!IsIgnorable ((int) c) && - c <= '\u045C' && - Char.IsLetter (c)) { + Char.IsLetter (c) && + !map [c].Defined) { AddLetterMap (c, 0x10, 0); fillIndex [0x10] += 3; } } - /* - for (int i = 0x0460; i < 0x0481; i++) { - if (Char.IsLetter ((char) i)) { - AddLetterMap ((char) i, 0x10, 0); - fillIndex [0x10] += 3; - } - } - */ -/* - for (int i = 0x0400; i <= 0x0486; i++) { - if (!Char.IsLetter ((char) i)) { -// AddCharMap ((char) i, 0x1, 1); - continue; - } - if (!cyrillicLetterPrimaryValues.ContainsKey (i)) { - Console.Error.WriteLine ("no value for {0:x04}", i); - continue; - } - fillIndex [0x10] = - (byte) cyrillicLetterPrimaryValues [i]; - AddLetterMap ((char) i, 0x10, 0); + + // NFKD + for (int i = 0x0401; i <= 0x045F; i++) + FillLetterNFKD (i, false, false); + + for (int i = 0; i < cymap_src.Length; i++) { + char c = cymap_src [i]; + fillIndex [0x10] = map [c].Level1; + int c2 = 0x0490 + i * 2; + AddLetterMapCore ((char) c2, 0x10, 0, diacritical [c2], false); } -*/ // Armenian fillIndex [0x11] = 0x3; - for (int i = 0x0531; i < 0x0586; i++) + fillIndex [0x1] = 0x98; + for (int i = 0x0531; i < 0x0586; i++) { + if (i == 0x0559 || i == 0x55A) + AddCharMap ((char) i, 1, 1); if (Char.IsLetter ((char) i)) AddLetterMap ((char) i, 0x11, 1); + } // Hebrew // -Letters - fillIndex [0x12] = 0x3; + fillIndex [0x12] = 0x2; for (int i = 0x05D0; i < 0x05FF; i++) - if (Char.IsLetter ((char) i)) - AddLetterMap ((char) i, 0x12, 1); + if (Char.IsLetter ((char) i)) { + if (isUppercase [i]) { + fillIndex [0x12]--; + AddLetterMap ((char) i, 0x12, 2); + } + else + AddLetterMap ((char) i, 0x12, 1); + } // -Accents fillIndex [0x1] = 0x3; - for (int i = 0x0591; i <= 0x05C2; i++) + for (int i = 0x0591; i <= 0x05C2; i++) { + if (i == 0x05A3 || i == 0x05BB) + fillIndex [0x1]++; if (i != 0x05BE) AddCharMap ((char) i, 0x1, 1); + } // Arabic fillIndex [0x1] = 0x8E; @@ -2056,14 +2586,33 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]); // (byte) arabicLetterPrimaryValues [i], 1); fillIndex [0x13] = (byte) arabicLetterPrimaryValues [i]; - AddLetterMap ((char) i, 0x13, 0); + byte formDiacritical = 8; // default + // SPECIAL CASES: + switch (i) { + case 0x0622: formDiacritical = 9; break; + case 0x0623: formDiacritical = 0xA; break; + case 0x0624: formDiacritical = 5; break; + case 0x0625: formDiacritical = 0xB; break; + case 0x0626: formDiacritical = 7; break; + case 0x0649: formDiacritical = 5; break; + case 0x064A: formDiacritical = 7; break; + } +// AddLetterMapCore ((char) i, 0x13, 1, formDiacritical, false); + AddArabicCharMap ((char) i, 0x13, 1, formDiacritical); } + for (int i = 0x0670; i < 0x0673; i++) + map [i] = new CharMapEntry (0x13, 0xB, (byte) (0xC + i - 0x670)); fillIndex [0x13] = 0x84; for (int i = 0x0674; i < 0x06D6; i++) if (Char.IsLetter ((char) i)) - AddLetterMap ((char) i, 0x13, 1); + AddLetterMapCore ((char) i, 0x13, 1, 0, false); // Devanagari + + // FIXME: this could be fixed in more decent way + for (int i = 0x0958; i <= 0x095F; i++) + diacritical [i] = 8; + // FIXME: it does seem straight codepoint mapping. fillIndex [0x14] = 04; for (int i = 0x0901; i < 0x0905; i++) @@ -2131,10 +2680,16 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]); if (c == '\u0A3C' || c == '\u0A4D' || '\u0A66' <= c && c <= '\u0A71') continue; - // SPECIAL CASE: U+A38 = U+A36 at primary level (why?) + // SPECIAL CASES byte shift = 4; - if (c == '\u0A36' || c == '\u0A16' || c == '\u0A17' || c == '\u0A5B' || c == '\u0A5E') + switch (c) { + case '\u0A33': case '\u0A36': case '\u0A16': + case '\u0A17': case '\u0A5B': case '\u0A5E': shift = 0; + break; + } + if (c == '\u0A3E') // Skip + fillIndex [0x16] = 0xC0; AddLetterMap (c, 0x16, shift); } @@ -2195,7 +2750,7 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]); AddLetterMap ((char) i, 0x1, 1); continue; } - AddLetterMap ((char) i, 0x18, 1); + AddLetterMapCore ((char) i, 0x18, 1, 0, true); } // Tamil @@ -2254,17 +2809,22 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]); // Malayalam fillIndex [0x1C] = 2; - for (int i = 0x0D02; i < 0x0D61; i++) + fillIndex [0x1] = 3; + for (int i = 0x0D02; i < 0x0D61; i++) { // FIXME: I avoided MSCompatUnicodeTable usage // here (it results in recursion). So check if // using NonSpacingMark makes sense or not. if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark) // if (!MSCompatUnicodeTable.IsIgnorable ((char) i)) AddCharMap ((char) i, 0x1C, 1); + else if (!IsIgnorable ((char) i)) + AddCharMap ((char) i, 1, 1); + } // Thai ... note that it breaks 0x1E wall after E2B! // Also, all Thai characters have level 2 value 3. fillIndex [0x1E] = 2; + fillIndex [0x1] = 3; for (int i = 0xE40; i <= 0xE44; i++) AddCharMap ((char) i, 0x1E, 1, 3); for (int i = 0xE01; i < 0xE2B; i++) @@ -2279,13 +2839,25 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]); char [] specialThai = new char [] {'\u0E45', '\u0E46', '\u0E4E', '\u0E4F', '\u0E5A', '\u0E5B'}; foreach (char c in specialThai) - AddCharMap (c, 0x1F, 1); + AddCharMap (c, 0x1F, 1, 3); + + for (int i = 0xE00; i < 0xE80; i++) + if (Char.GetUnicodeCategory ((char) i) == + UnicodeCategory.NonSpacingMark) + AddCharMap ((char) i, 1, 1); // Lao fillIndex [0x1F] = 2; - for (int i = 0xE80; i < 0xEDF; i++) - if (Char.IsLetter ((char) i)) + fillIndex [0x1] = 3; + for (int i = 0xE80; i < 0xEDF; i++) { + if (IsIgnorable ((char) i)) + continue; + else if (Char.IsLetter ((char) i)) AddCharMap ((char) i, 0x1F, 1); + else if (Char.GetUnicodeCategory ((char) i) == + UnicodeCategory.NonSpacingMark) + AddCharMap ((char) i, 1, 1); + } // Georgian. orderedGeorgian is from UCA DUCET. fillIndex [0x21] = 5; @@ -2362,6 +2934,21 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]); AddLetterMap ((char) 0x3093, 0x22, 0); AddLetterMap ((char) (0x3093 + 0x60), 0x22, 0); + map [0x3094] = new CharMapEntry (map [0x30A6].Category, + map [0x30A6].Level1, 3);// voiced hiragana U + map [0x30F4] = new CharMapEntry (map [0x30A6].Category, + map [0x30A6].Level1, 3);// voiced katakana U + + map [0x30F5] = new CharMapEntry (map [0x30AB].Category, + map [0x30AB].Level1, 0);// small katakana Ka + map [0x30F6] = new CharMapEntry (map [0x30B1].Category, + map [0x30B1].Level1, 0);// small katakana Ke + // voiced Wa lines + for (int i = 0x30F7; i < 0x30FB; i++) + map [i] = new CharMapEntry (map [i - 8].Category, + map [i - 8].Level1, + 3); + // JIS Japanese square chars. fillIndex [0x22] = 0x97; jisJapanese.Sort (JISComparer.Instance); @@ -2402,10 +2989,11 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]); // Thaana // FIXME: it turned out that it does not look like UCA fillIndex [0x24] = 0x6E; + fillIndex [0x1] = 0xAC; for (int i = 0; i < orderedThaana.Length; i++) { char c = orderedThaana [i]; if (IsIgnorableNonSpacing ((int) c)) - continue; + AddCharMap (c, 1, 1); AddCharMap (c, 0x24, 2); if (c == '\u0782') // SPECIAL CASE: why? fillIndex [0x24] += 2; @@ -2462,9 +3050,9 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]); + "[\u11E8 \u11E9],, \u11EA \u317D,, \u110A=\u11BB,,, >" + "<{\u1134 \u1140}, \u317E,,,,,,, \u11EB," + "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >" - + "<{\u1141 \u114C}, \u11EE, \u11EC, \u11ED,,,,, " + + "<{\u1141 \u114C}, \u3180=\u11EE, \u11EC, \u11ED,,,,, " + "\u11F1,, \u11F2,,," - + "\u11EF,,, \u11F0, \u110C=\u11BD,, >" + + "\u11EF,,, \u3181=\u11F0, \u110C=\u11BD,, >" + "<\u114D, \u110D,, >" + "<{\u114E \u1151},, \u110E=\u11BE,, >" + "<{\u1152 \u1155},,, \u110F=\u11BF >" @@ -2597,30 +3185,31 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]); // PrivateUse ... computed. // remaining Surrogate ... computed. - #region Special "biggest" area (FF FF) - fillIndex [0xFF] = 0xFF; - char [] specialBiggest = new char [] { - '\u3005', '\u3031', '\u3032', '\u309D', - '\u309E', '\u30FC', '\u30FD', '\u30FE', - '\uFE7C', '\uFE7D', '\uFF70'}; - foreach (char c in specialBiggest) - AddCharMap (c, 0xFF, 0); - #endregion - #region 07 - ASCII non-alphanumeric + 3001, 3002 // 07 // non-alphanumeric ASCII except for: + - < = > ' for (int i = 0x21; i < 0x7F; i++) { + // SPECIAL CASE: 02C6 looks regarded as + // equivalent to '^', which does not conform + // to Unicode standard character database. + if (i == 0x005B) + AddCharMap ('\u2045', 0x7, 0, 0x1C); + if (i == 0x005D) + AddCharMap ('\u2046', 0x7, 0, 0x1C); + if (i == 0x005E) + AddCharMap ('\u02C6', 0x7, 0, 3); + if (i == 0x0060) + AddCharMap ('\u02CB', 0x7, 0, 3); + if (Char.IsLetterOrDigit ((char) i) || "+-<=>'".IndexOf ((char) i) >= 0) continue; // they are not added here. - AddCharMapGroup2 ((char) i, 0x7, 1, 0); + + AddCharMapGroup2 ((char) i, 0x7, 1, 0); // Insert 3001 after ',' and 3002 after '.' if (i == 0x2C) AddCharMapGroup2 ('\u3001', 0x7, 1, 0); - else if (i == 0x2E) { - fillIndex [0x7]--; + else if (i == 0x2E) AddCharMapGroup2 ('\u3002', 0x7, 1, 0); - } else if (i == 0x3A) AddCharMap ('\uFE30', 0x7, 1, 0); } @@ -2633,16 +3222,35 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]); // FIXME: actually those reset should not be // done but here I put for easy goal. + if (i == 0x05C3) + fillIndex [0x7]++; if (i == 0x0700) fillIndex [0x7] = 0xE2; if (i == 0x2016) fillIndex [0x7] = 0x77; + if (i == 0x3008) + fillIndex [0x7] = 0x93; + + if (0x02C8 <= i && i <= 0x02CD) + continue; // nonspacing marks + + // SPECIAL CASE: maybe they could be allocated + // dummy NFKD mapping and no special processing + // would be required here. + if (i == 0x00AF) + AddCharMap ('\u02C9', 0x7, 0, 3); + if (i == 0x00B4) + AddCharMap ('\u02CA', 0x7, 0, 3); + if (i == 0x02C7) + AddCharMap ('\u02D8', 0x7, 0, 3); // SPECIAL CASES: switch (i) { case 0xAB: // 08 case 0xB7: // 0A case 0xBB: // 08 + case 0x02B9: // 01 + case 0x02BA: // 01 case 0x2329: // 09 case 0x232A: // 09 continue; @@ -2652,29 +3260,106 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]); case UnicodeCategory.OtherPunctuation: case UnicodeCategory.ClosePunctuation: case UnicodeCategory.OpenPunctuation: + case UnicodeCategory.ConnectorPunctuation: case UnicodeCategory.InitialQuotePunctuation: case UnicodeCategory.FinalQuotePunctuation: case UnicodeCategory.ModifierSymbol: // SPECIAL CASES: // 0xA - if (0x2020 <= i && i <= 0x2042) + if (0x2020 <= i && i <= 0x2031) continue; - AddCharMapGroup ((char) i, 0x7, 1, 0); + if (i == 0x3003) // added later + continue; + AddCharMapGroup2 ((char) i, 0x7, 1, 0); break; default: - if (i == 0xA6) // SPECIAL CASE. FIXME: why? + if (i == 0xA6 || i == 0x1C3 || i == 0x037A) // SPECIAL CASE. FIXME: why? goto case UnicodeCategory.OtherPunctuation; break; } } + // Control pictures // FIXME: it should not need to reset level 1, but // it's for easy goal. fillIndex [0x7] = 0xB6; - for (int i = 0x2400; i <= 0x2421; i++) + for (int i = 0x2400; i <= 0x2424; i++) AddCharMap ((char) i, 0x7, 1, 0); + + // FIXME: what are they? + AddCharMap ('\u3003', 0x7, 1); + AddCharMap ('\u3006', 0x7, 1); + AddCharMap ('\u02D0', 0x7, 1); + AddCharMap ('\u10FB', 0x7, 1); + AddCharMap ('\u0950', 0x7, 1); + AddCharMap ('\u093D', 0x7, 1); + AddCharMap ('\u0964', 0x7, 1); + AddCharMap ('\u0965', 0x7, 1); + AddCharMap ('\u0970', 0x7, 1); + + #endregion + + #region category 08 - symbols + fillIndex [0x8] = 2; + // Here Windows mapping is not straightforward. It is + // not based on computation but seems manual sorting. + AddCharMapGroup ('+', 0x8, 1, 0); // plus + AddCharMapGroup ('\u2212', 0x8, 1); // minus + AddCharMapGroup ('\u229D', 0x8, 1); // minus + AddCharMapGroup ('\u2297', 0x8, 1); // mul + AddCharMapGroup ('\u2044', 0x8, 1); // div + AddCharMapGroup ('\u2215', 0x8, 0); // div + AddCharMapGroup ('\u2298', 0x8, 1); // div slash + AddCharMapGroup ('\u2217', 0x8, 0); // mul + AddCharMapGroup ('\u229B', 0x8, 1); // asterisk oper + AddCharMapGroup ('\u2218', 0x8, 0); // ring + AddCharMapGroup ('\u229A', 0x8, 1); // ring + AddCharMapGroup ('\u2219', 0x8, 0); // bullet + AddCharMapGroup ('\u2299', 0x8, 1); // dot oper + AddCharMapGroup ('\u2213', 0x8, 1); // minus-or-plus + AddCharMapGroup ('\u003C', 0x8, 1); // < + AddCharMapGroup ('\u227A', 0x8, 1); // precedes relation + AddCharMapGroup ('\u22B0', 0x8, 1); // precedes under relation + + for (int cp = 0; cp < 0x2300; cp++) { + if (cp == 0xAC) // SPECIAL CASE: skip + continue; + if (cp == 0x200) { + cp = 0x2200; // skip to 2200 + fillIndex [0x8] = 0x21; + } + if (cp == 0x2295) + fillIndex [0x8] = 0x3; + if (cp == 0x22A2) + fillIndex [0x8] = 0xAB; + if (cp == 0x22B2) + fillIndex [0x8] = 0xB9; + if (!map [cp].Defined && +// Char.GetUnicodeCategory ((char) cp) == +// UnicodeCategory.MathSymbol) + Char.IsSymbol ((char) cp)) + AddCharMapGroup ((char) cp, 0x8, 1); + // SPECIAL CASES: no idea why Windows sorts as such + switch (cp) { + case 0x3E: + AddCharMap ('\u227B', 0x8, 1, 0); + AddCharMap ('\u22B1', 0x8, 1, 0); + break; + case 0xB1: + AddCharMapGroup ('\u00AB', 0x8, 1); + AddCharMapGroup ('\u226A', 0x8, 1); + AddCharMapGroup ('\u00BB', 0x8, 1); + AddCharMapGroup ('\u226B', 0x8, 1); + break; + case 0xF7: + AddCharMap ('\u01C0', 0x8, 1, 0); + AddCharMap ('\u01C1', 0x8, 1, 0); + AddCharMap ('\u01C2', 0x8, 1, 0); + break; + } + } #endregion - // FIXME: for 07 xx we need more love. + #region Hack! // Characters w/ diacritical marks (NFKD) for (int i = 0; i <= char.MaxValue; i++) { @@ -2685,7 +3370,7 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]); int start = decompIndex [i]; int primaryChar = decompValues [start]; - int secondary = 0; + int secondary = diacritical [i]; bool skip = false; int length = decompLength [i]; // special processing for parenthesized ones. @@ -2714,59 +3399,8 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]); } - // category 08 - symbols - fillIndex [0x8] = 2; - // Here Windows mapping is not straightforward. It is - // not based on computation but seems manual sorting. - AddCharMapGroup ('+', 0x8, 1, 0); // plus - AddCharMapGroup ('\u2212', 0x8, 1, 0); // minus - AddCharMapGroup ('\u229D', 0x8, 1, 0); // minus - AddCharMapGroup ('\u2297', 0x8, 1, 0); // mul - AddCharMapGroup ('\u2044', 0x8, 1, 0); // div - AddCharMapGroup ('\u2215', 0x8, 1, 0); // div - AddCharMapGroup ('\u2217', 0x8, 1, 0); // mul - AddCharMapGroup ('\u2218', 0x8, 1, 0); // ring - AddCharMapGroup ('\u2219', 0x8, 1, 0); // bullet - AddCharMapGroup ('\u2213', 0x8, 1, 0); // minus-or-plus - AddCharMapGroup ('\u003C', 0x8, 1, 0); // < - AddCharMapGroup ('\u227A', 0x8, 1, 0); // precedes relation - AddCharMapGroup ('\u22B0', 0x8, 1, 0); // precedes under relation + // Diacritical weight adjustment - for (int cp = 0; cp < 0x2300; cp++) { - if (cp == 0xAC) // SPECIAL CASE: skip - continue; - if (cp == 0x200) { - cp = 0x2200; // skip to 2200 - fillIndex [0x8] = 0x21; - } - if (cp == 0x2295) - fillIndex [0x8] = 0x3; - if (!map [cp].Defined && -// Char.GetUnicodeCategory ((char) cp) == -// UnicodeCategory.MathSymbol) - Char.IsSymbol ((char) cp)) - AddCharMapGroup ((char) cp, 0x8, 1, 0); - // SPECIAL CASES: no idea why Windows sorts as such - switch (cp) { - case 0x3E: - AddCharMap ('\u227B', 0x8, 1, 0); - AddCharMap ('\u22B1', 0x8, 1, 0); - break; - case 0xB1: - AddCharMapGroup ('\u00AB', 0x8, 1, 0); - AddCharMapGroup ('\u226A', 0x8, 1, 0); - AddCharMapGroup ('\u00BB', 0x8, 1, 0); - AddCharMapGroup ('\u226B', 0x8, 1, 0); - break; - case 0xF7: - AddCharMap ('\u01C0', 0x8, 1, 0); - AddCharMap ('\u01C1', 0x8, 1, 0); - AddCharMap ('\u01C2', 0x8, 1, 0); - break; - } - } - - #region Level2 adjustment // Arabic Hamzah diacritical [0x624] = 0x5; diacritical [0x626] = 0x7; @@ -2785,6 +3419,10 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]); mod = diacritical [i]; break; case 0x13: // Arabic + if (i == 0x0621) + break; // 0 + if (diacritical [i] == 0 && decompLength [i] != 0) + diacritical [i] = map [decompValues [decompIndex [i]]].Level2; if (diacritical [i] == 0 && i >= 0xFE8D) mod = 0x8; // default for arabic break; @@ -2795,25 +3433,79 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]); map [i] = new CharMapEntry ( cat, map [i].Level1, mod); } - #endregion - // FIXME: this is hack but those NonSpacingMark + // FIXME: this is halfly hack but those NonSpacingMark // characters and still undefined are likely to // be nonspacing. - for (int i = 0; i < char.MaxValue; i++) - if (!map [i].Defined && - !IsIgnorable (i) && - Char.GetUnicodeCategory ((char) i) == + for (int i = 0; i < char.MaxValue; i++) { + if (map [i].Defined || + IsIgnorable (i)) + continue; + switch (i) { + // SPECIAL CASES. + case 0x02B9: + case 0x02BA: + break; + default: + if (Char.GetUnicodeCategory ((char) i) != UnicodeCategory.NonSpacingMark) + continue; + break; + } + if (diacritical [i] != 0) + map [i] = new CharMapEntry (1, 1, diacritical [i]); + else AddCharMap ((char) i, 1, 1); + } - // FIXME: this is hack but those Symbol characters - // are likely to fall into 0xA category. - for (int i = 0; i < char.MaxValue; i++) - if (!map [i].Defined && - !IsIgnorable (i) && - Char.IsSymbol ((char) i)) - AddCharMap ((char) i, 0xA, 1); + #endregion + } + + TextInfo ti = CultureInfo.InvariantCulture.TextInfo; + + private void FillLetterNFKD (int i, bool checkUpper, bool greekRemap) + { + if (map [i].Defined) + return; + int up = (int) ti.ToUpper ((char) i); + if (checkUpper && map [up].Category == 0xF) { + if (i == up) + return; + FillLetterNFKD (up, checkUpper, greekRemap); + map [i] = new CharMapEntry (0xF, + map [up].Level1, + map [up].Level2); + } else { + int idx = decompIndex [i]; + if (idx == 0) + return; + int primary = decompValues [decompIndex [i]]; + FillLetterNFKD (primary, checkUpper, greekRemap); + + int lv2 = map [primary].Level2; + byte off = 0; + for (int l = 1; l < decompLength [i]; l++) { + int tmp = decompValues [idx + l]; + if (map [tmp].Category != 1) + return; + if (greekRemap && map [tmp].Level2 == 0xC) + off += 3; + else + off += map [tmp].Level2; + } + if (off > 0) { + if (lv2 == 0) + lv2 += 2; + lv2 += off; + } + // ... but override if the value already exists. + if (diacritical [i] != 0) + lv2 = diacritical [i]; + map [i] = new CharMapEntry ( + map [primary].Category, + map [primary].Level1, + (byte) lv2); + } } private void IncrementSequentialIndex (ref byte hangulCat) @@ -2845,32 +3537,32 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]); char c = (char) (i + b); byte arg = (byte) (b > 0 ? b + 2 : 0); // Hiragana - AddLetterMapCore (c, 0x22, 0, arg); + AddLetterMapCore (c, 0x22, 0, arg, false); // Katakana - AddLetterMapCore ((char) (c + 0x60), 0x22, 0, arg); + AddLetterMapCore ((char) (c + 0x60), 0x22, 0, arg, false); } } private void AddLetterMap (char c, byte category, byte updateCount) { - AddLetterMapCore (c, category, updateCount, 0); + AddLetterMapCore (c, category, updateCount, 0, true); } - private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2) + private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2, bool deferLevel2) { char c2; // updates index c2 = ToSmallForm (c); if (c2 != c) - AddCharMapGroup (c2, category, updateCount, level2); + AddCharMapGroup (c2, category, updateCount, level2, deferLevel2); c2 = Char.ToLower (c, CultureInfo.InvariantCulture); if (c2 != c && !map [(int) c2].Defined) - AddLetterMapCore (c2, category, 0, level2); + AddLetterMapCore (c2, category, 0, level2, deferLevel2); bool doUpdate = true; if (IsIgnorable ((int) c) || map [(int) c].Defined) doUpdate = false; else - AddCharMapGroup (c, category, 0, level2); + AddCharMapGroup (c, category, 0, level2, deferLevel2); if (doUpdate) fillIndex [category] += updateCount; } @@ -2891,19 +3583,6 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]); return true; } - private void AddCharMapGroupTail (char c, byte category, byte updateCount) - { - char c2 = ToSmallFormTail (c); - if (c2 != c) - AddCharMap (c2, category, updateCount, 0); - // itself - AddCharMap (c, category, updateCount, 0); - // - c2 = ToFullWidthTail (c); - if (c2 != c) - AddCharMapGroupTail (c2, category, updateCount); - } - // // Adds characters to table in the order below // (+ increases weight): @@ -2925,11 +3604,24 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]); DecompositionWide, DecompositionNarrow, }; + private void AddCharMapGroup (char c, byte category, byte updateCount) + { + AddCharMapGroup (c, category, updateCount, 0, true); + } + private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2) + { + AddCharMapGroup (c, category, updateCount, level2, false); + } + + private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2, bool deferLevel2) { if (map [(int) c].Defined) return; + if (deferLevel2) + level2 = diacritical [(int) c]; + char small = char.MinValue; char vertical = char.MinValue; Hashtable nfkd = (Hashtable) nfkdMap [(int) c]; @@ -2943,8 +3635,11 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]); } // updates index - if (small != char.MinValue) - AddCharMap (small, category, updateCount); + if (small != char.MinValue) { + if (level2 == 0 && deferLevel2) + level2 = diacritical [small]; + AddCharMap (small, category, updateCount, level2); + } // itself AddCharMap (c, category, 0, level2); @@ -2952,16 +3647,22 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]); if (nfkd != null) { foreach (int weight in sameWeightItems) { object wv = nfkd [(byte) weight]; - if (wv != null) + if (wv != null) { + if (deferLevel2) + level2 = diacritical [(int) wv]; AddCharMap ((char) ((int) wv), category, 0, level2); + } } } // update index here. fillIndex [category] += updateCount; - if (vertical != char.MinValue) + if (vertical != char.MinValue) { + if (level2 == 0 && deferLevel2) + level2 = diacritical [vertical]; AddCharMap (vertical, category, updateCount, level2); + } } private void AddCharMapCJK (char c, ref byte category) @@ -3024,23 +3725,44 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]); // For now it is only for 0x7 category. private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2) { - char small = char.MinValue; - char vertical = char.MinValue; - Hashtable nfkd = (Hashtable) nfkdMap [(int) c]; - if (nfkd != null) { - object smv = nfkd [(byte) DecompositionSmall]; - if (smv != null) - small = (char) ((int) smv); - object vv = nfkd [(byte) DecompositionVertical]; - if (vv != null) - vertical = (char) ((int) vv); + if (map [(int) c].Defined) + return; + + bool updateWeight = false; + // Process in advance (lower primary weight) + for (int c2 = 0; c2 < char.MaxValue; c2++) { + if (!map [c2].Defined && + decompLength [c2] == 1 && + (int) (decompValues [decompIndex [c2]]) == (int) c) { + switch (decompType [c2]) { + case DecompositionSmall: + updateWeight = true; + AddCharMap ((char) c2, category, + 0, level2); + break; + } + } } + if (updateWeight) + fillIndex [category] = (byte) + (fillIndex [category] + updateCount); - // updates index - if (small != char.MinValue) - // SPECIAL CASE excluded (FIXME: why?) - if (small != '\u2024') - AddCharMap (small, category, updateCount); + // Identical weight + for (int c2 = 0; c2 < char.MaxValue; c2++) { + if (!map [c2].Defined && + decompLength [c2] == 1 && + (int) (decompValues [decompIndex [c2]]) == (int) c) { + switch (decompType [c2]) { + case DecompositionSub: + case DecompositionSuper: + case DecompositionWide: + case DecompositionNarrow: + AddCharMap ((char) c2, category, + 0, level2); + break; + } + } + } // itself AddCharMap (c, category, updateCount, level2); @@ -3048,28 +3770,26 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]); // Since nfkdMap is problematic to have two or more // NFKD to an identical character, here I iterate all. for (int c2 = 0; c2 < char.MaxValue; c2++) { - if (decompLength [c2] == 1 && + if (!map [c2].Defined && + decompLength [c2] == 1 && (int) (decompValues [decompIndex [c2]]) == (int) c) { switch (decompType [c2]) { - case DecompositionCompat: + case DecompositionWide: + case DecompositionNarrow: + case DecompositionSmall: + case DecompositionSub: + case DecompositionSuper: + continue; + default: AddCharMap ((char) c2, category, updateCount, level2); break; } } } - - if (vertical != char.MinValue) - // SPECIAL CASE excluded (FIXME: why?) - if (vertical != '\uFE33' && vertical != '\uFE34') - AddCharMap (vertical, category, updateCount, level2); } - private void AddArabicCharMap (char c) + private void AddArabicCharMap (char c, byte category, byte updateCount, byte level2) { - byte category = 6; - byte updateCount = 1; - byte level2 = 0; - // itself AddCharMap (c, category, 0, level2); @@ -3086,26 +3806,11 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]); fillIndex [category] += updateCount; } - char ToFullWidth (char c) - { - return ToDecomposed (c, DecompositionFull, false); - } - - char ToFullWidthTail (char c) - { - return ToDecomposed (c, DecompositionFull, true); - } - char ToSmallForm (char c) { return ToDecomposed (c, DecompositionSmall, false); } - char ToSmallFormTail (char c) - { - return ToDecomposed (c, DecompositionSmall, true); - } - char ToDecomposed (char c, byte d, bool tail) { if (decompType [(int) c] != d) @@ -3139,9 +3844,27 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]); // CJK compat if ('\u3192' <= c && c <= '\u319F') return 0; - // Japanese reading marks - if (c == '\u3001' || c == '\u3002') - return 2; + + // They have NFKD mapping, and on Windows + // those narrow characters are regarded as "normal", + // thus those characters themselves are regarded as + // "wide". grep "" and you can pick them up + // (ignoring Kana, Hangul etc.) + switch (c) { + case '\u3002': + case '\u300C': + case '\u300D': + case '\u3001': + case '\u30FB': + case '\u2502': + case '\u2190': + case '\u2191': + case '\u2192': + case '\u2193': + case '\u25A0': + case '\u25CB': + return 1; + } // Korean if ('\u11A8' <= c && c <= '\u11F9') return 2; @@ -3164,22 +3887,32 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]); if ('\u2160' <= c && c <= '\u216F') return 0x10; if ('\u2181' <= c && c <= '\u2182') - return 0x18; + return 0x10; // Arabic if ('\u2135' <= c && c <= '\u2138') return 4; - if ('\uFE80' <= c && c < '\uFF00') { + // I believe that Windows has a bug on setting level 3 + // weight here. NFKD results in different values. + if ('\uFE80' < c && c < '\uFF00') { // 2(Isolated)/8(Final)/0x18(Medial) switch (decompType [(int) c]) { case DecompositionIsolated: - return 2; + return 0; // 2; case DecompositionFinal: return 8; case DecompositionMedial: return 0x18; + case DecompositionInitial: + return 0x10; } } + // I have no idea why those symbols have level 3 weight + if (c == '\u2104' || c == '\u212B') + return 0x18; + if ('\u211E' <= c && c <= '\u212B') + return 0x10; + // actually I dunno the reason why they have weights. switch (c) { case '\u01BC': @@ -3188,17 +3921,23 @@ Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]); return 0x20; case '\u06AA': return 0x28; + // Gurmukhi + case '\u0A39': + case '\u0A59': + case '\u0A5A': + case '\u0A5B': + case '\u0A5E': + return 0x10; } byte ret = 0; switch (c) { case '\u03C2': - case '\u2104': case '\u212B': - ret |= 8; + ret = 8; break; case '\uFE42': - ret |= 0xC; + ret = 0xA; break; }