//
+// create-mscompat-collation-table.cs : generates Windows-like sortkey tables.
+//
+// Author:
+// Atsushi Enomoto <atsushi@ximian.com>
+//
+// Copyright (C) 2005 Novell, Inc (http://www.novell.com)
+//
+// Permission is hereby granted, free of charge, to any person obtaining
+// a copy of this software and associated documentation files (the
+// "Software"), to deal in the Software without restriction, including
+// without limitation the rights to use, copy, modify, merge, publish,
+// distribute, sublicense, and/or sell copies of the Software, and to
+// permit persons to whom the Software is furnished to do so, subject to
+// the following conditions:
+//
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+//
+
//
// There are two kinds of sort keys: those which are computed and those which
// are laid out as an indexed array. Computed sort keys are:
// - Surrogate
// - PrivateUse
//
-// Also, for composite characters it should prepare different index table.
-//
// Though it is possible to "compute" level 3 weights, they are still dumped
// to an array to avoid execution cost.
//
-
-//
-// * sortkey getter signature
-//
-// int GetSortKey (string s, int index, SortKeyBuffer buf)
-// Stores sort key for corresponding character element into buf and
-// returns the length of the consumed _source_ character element in s.
-//
-// * character length to consume
-//
-// If there are characters whose primary weight is 0, they are consumed
-// and considered as a part of the character element.
-//
#define Binary
using System;
using System.Text;
using System.Xml;
+using UUtil = Mono.Globalization.Unicode.MSCompatUnicodeTableUtil;
+
namespace Mono.Globalization.Unicode
{
internal class MSCompatSortKeyTableGenerator
const int DecompositionCompat = 0x11;
const int DecompositionCanonical = 0x12;
- TextWriter Result = Console.Out;
+ TextWriter CSResult = Console.Out;
+ TextWriter CResult = TextWriter.Null;
byte [] fillIndex = new byte [256]; // by category
CharMapEntry [] map = new CharMapEntry [char.MaxValue + 1];
string [] diacritics = new string [] {
// LATIN, CYRILLIC etc.
- "UPTURN", "DOUBLE-STRUCK",
- "MIDDLE HOOK", "WITH VERTICAL LINE ABOVE;",
- "WITH GRAVE ACCENT;", "WITH ACUTE ACCENT;", "WITH CIRCUMFLEX ACCENT;",
- "WITH ACUTE;", "WITH GRAVE;", "WITH DOT ABOVE;", " MIDDLE DOT;",
- "WITH CIRCUMFLEX;", "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;",
- " DIALYTIKA AND TONOS;", "WITH MACRON;", "WITH TILDE;", "WITH RING ABOVE;",
+ "VERTICAL LINE ABOVE", "UPTURN", "DOUBLE-STRUCK",
+ "ABKHASIAN",
+ "MIDDLE HOOK", "WITH VERTICAL LINE ABOVE;", "WITH TONOS",
+ "WITH ACUTE ACCENT;", "WITH GRAVE ACCENT;",
+ "WITH ACUTE;", "WITH GRAVE;",
+ //
+ "WITH DOT ABOVE;", " MIDDLE DOT;",
+ "WITH CIRCUMFLEX ACCENT;", "WITH CIRCUMFLEX;",
+ "WITH DIALYTIKA;",
+ "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;",
+ "DIALYTIKA TONOS", "DIALYTIKA AND TONOS",
+ "ABKHASIAN CHE WITH DESCENDER",
+ "WITH MACRON;", "WITH TILDE;", "WITH RING ABOVE;",
"WITH OGONEK;", "WITH CEDILLA;",
//
" DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;",
" BREVE AND TILDE",
" CEDILLA AND BREVE",
" OGONEK AND MACRON",
- //
- "WITH OVERLINE",
+ // 0x40
+ "WITH OVERLINE", "DOUBLE VERTICAL LINE ABOVE",
"WITH HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
" DOUBLE GRAVE",
" INVERTED BREVE",
" LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE",
" PALATAL HOOK",
" DOT BELOW;",
- " RETROFLEX;", "DIAERESIS BELOW",
- " RING BELOW",
+ " RETROFLEX;", "DIAERESIS BELOW", "RETROFLEX HOOK",
+ " RING BELOW", "LOW VERTICAL LINE",
//
" CIRCUMFLEX BELOW", "HORN AND ACUTE",
" BREVE BELOW;", " HORN AND GRAVE",
+ " LOW MACRON",
" TILDE BELOW",
" TOPBAR",
" DOT BELOW AND DOT ABOVE",
};
byte [] diacriticWeights = new byte [] {
// LATIN.
- 3, 3, 5, 5,
- 0xF, 0xE, 0x12,
- 0xE, 0xF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
- 0x17, 0x19, 0x1A, 0x1B, 0x1C,
+ 3, 3, 3, 5, 5, 5, 5,
+ 0xE, 0xF,
+ 0xE, 0xF,
+ //
+ 0x10, 0x11, 0x12, 0x12, 0x13, 0x13, 0x14, 0x15, 0x16,
+ 0x16, 0x17, 0x17, 0x19, 0x1A, 0x1B, 0x1C,
//
0x1D, 0x1D, 0x1E, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
0x20, 0x21, 0x22, 0x22, 0x23, 0x24,
0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28,
0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30,
//
- 0x40, 0x43, 0x43, 0x43, 0x44, 0x46, 0x47, 0x48,
- 0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x5A,
+ 0x40, 0x41, 0x43, 0x43, 0x43, 0x44, 0x46, 0x47, 0x48,
+ 0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x59,
+ 0x5A, 0x5A,
//
- 0x60, 0x60, 0x61, 0x61, 0x63, 0x68, 0x68,
+ 0x60, 0x60, 0x61, 0x61, 0x62, 0x63, 0x68, 0x68,
0x69, 0x69, 0x6A, 0x6D, 0x6E,
0x87, 0x95, 0xAA,
// CIRCLED, PARENTHESIZED and so on.
ModifyParsedValues ();
GenerateCore ();
Console.Error.WriteLine ("generation done.");
+ CResult = new StreamWriter ("collation-tables.h", false);
Serialize ();
+ CResult.Close ();
Console.Error.WriteLine ("serialization done.");
/*
StreamWriter sw = new StreamWriter ("agelog.txt");
source, typeof (ushort), i);
}
+ void WriteByte (byte value) // Placeholder: the body is empty, so `value` is currently discarded.
+ {
+ // NOTE(review): no implementation here and no caller visible in this chunk; confirm whether this stub is still needed.
+ }
+
void Serialize ()
{
// Tailorings
byte [] level1 = new byte [map.Length];
byte [] level2 = new byte [map.Length];
byte [] level3 = new byte [map.Length];
- ushort [] widthCompat = new ushort [map.Length];
+// widthCompat has been removed from the mapping table.
+// If it turns out that it is still required, grep this source and uncomment
+// the widthCompat-related lines. FIXME: remove those lines in the future.
+// ushort [] widthCompat = new ushort [map.Length];
for (int i = 0; i < map.Length; i++) {
categories [i] = map [i].Category;
level1 [i] = map [i].Level1;
level2 [i] = map [i].Level2;
level3 [i] = ComputeLevel3Weight ((char) i);
+/*
// For Japanese Half-width characters, don't
// map widthCompat. It is IgnoreKanaType that
// handles those width differences.
widthCompat [i] = (ushort) decompValues [decompIndex [i]];
break;
}
+*/
}
// compress
ignorableFlags = CompressArray (ignorableFlags,
- MSCompatUnicodeTableUtil.Ignorable);
- categories = CompressArray (categories,
- MSCompatUnicodeTableUtil.Category);
- level1 = CompressArray (level1,
- MSCompatUnicodeTableUtil.Level1);
- level2 = CompressArray (level2,
- MSCompatUnicodeTableUtil.Level2);
- level3 = CompressArray (level3,
- MSCompatUnicodeTableUtil.Level3);
- widthCompat = (ushort []) CodePointIndexer.CompressArray (
- widthCompat, typeof (ushort),
- MSCompatUnicodeTableUtil.WidthCompat);
- cjkCHS = CompressArray (cjkCHS,
- MSCompatUnicodeTableUtil.CjkCHS);
- cjkCHT = CompressArray (cjkCHT,
- MSCompatUnicodeTableUtil.Cjk);
- cjkJA = CompressArray (cjkJA,
- MSCompatUnicodeTableUtil.Cjk);
- cjkKO = CompressArray (cjkKO,
- MSCompatUnicodeTableUtil.Cjk);
- cjkKOlv2 = CompressArray (cjkKOlv2,
- MSCompatUnicodeTableUtil.Cjk);
+ UUtil.Ignorable);
+ categories = CompressArray (categories, UUtil.Category);
+ level1 = CompressArray (level1, UUtil.Level1);
+ level2 = CompressArray (level2, UUtil.Level2);
+ level3 = CompressArray (level3, UUtil.Level3);
+// widthCompat = (ushort []) CodePointIndexer.CompressArray (
+// widthCompat, typeof (ushort), UUtil.WidthCompat);
+ cjkCHS = CompressArray (cjkCHS, UUtil.CjkCHS);
+ cjkCHT = CompressArray (cjkCHT,UUtil.Cjk);
+ cjkJA = CompressArray (cjkJA, UUtil.Cjk);
+ cjkKO = CompressArray (cjkKO, UUtil.Cjk);
+ cjkKOlv2 = CompressArray (cjkKOlv2, UUtil.Cjk);
// Ignorables
- Result.WriteLine ("internal static readonly byte [] ignorableFlags = new byte [] {");
+ CResult.WriteLine ("static const guint8 collation_table_ignorableFlags [] = {");
+ CSResult.WriteLine ("static readonly byte [] ignorableFlagsArr = new byte [] {");
#if Binary
MemoryStream ms = new MemoryStream ();
BinaryWriter binary = new BinaryWriter (ms);
+ binary.Write (UUtil.ResourceVersion);
binary.Write (ignorableFlags.Length);
#endif
for (int i = 0; i < ignorableFlags.Length; i++) {
byte value = ignorableFlags [i];
if (value < 10)
- Result.Write ("{0},", value);
+ CSResult.Write ("{0},", value);
else
- Result.Write ("0x{0:X02},", value);
+ CSResult.Write ("0x{0:X02},", value);
+ CResult.Write ("{0},", value);
#if Binary
binary.Write (value);
#endif
- if ((i & 0xF) == 0xF)
- Result.WriteLine ("// {0:X04}", i - 0xF);
+ if ((i & 0xF) == 0xF) {
+ CSResult.WriteLine ("// {0:X04}",
+ UUtil.Ignorable.ToCodePoint (i - 0xF));
+ CResult.WriteLine ();
+ }
}
- Result.WriteLine ("};");
- Result.WriteLine ();
+ CResult.WriteLine ("0};");
+ CSResult.WriteLine ("};");
+ CSResult.WriteLine ();
// Primary category
- Result.WriteLine ("internal static readonly byte [] categories = new byte [] {");
+ CResult.WriteLine ("static const guint8 collation_table_category [] = {");
+ CSResult.WriteLine ("static readonly byte [] categoriesArr = new byte [] {");
#if Binary
binary.Write (categories.Length);
#endif
for (int i = 0; i < categories.Length; i++) {
byte value = categories [i];
if (value < 10)
- Result.Write ("{0},", value);
+ CSResult.Write ("{0},", value);
else
- Result.Write ("0x{0:X02},", value);
+ CSResult.Write ("0x{0:X02},", value);
+ CResult.Write ("{0},", value);
#if Binary
binary.Write (value);
#endif
- if ((i & 0xF) == 0xF)
- Result.WriteLine ("// {0:X04}", i - 0xF);
+ if ((i & 0xF) == 0xF) {
+ CSResult.WriteLine ("// {0:X04}",
+ UUtil.Category.ToCodePoint (i - 0xF));
+ CResult.WriteLine ();
+ }
}
- Result.WriteLine ("};");
- Result.WriteLine ();
+ CResult.WriteLine ("};");
+ CSResult.WriteLine ("};");
+ CSResult.WriteLine ();
// Primary weight value
- Result.WriteLine ("internal static readonly byte [] level1 = new byte [] {");
+ CResult.WriteLine ("static const guint8 collation_table_level1 [] = {");
+ CSResult.WriteLine ("static readonly byte [] level1Arr = new byte [] {");
#if Binary
binary.Write (level1.Length);
#endif
for (int i = 0; i < level1.Length; i++) {
byte value = level1 [i];
if (value < 10)
- Result.Write ("{0},", value);
+ CSResult.Write ("{0},", value);
else
- Result.Write ("0x{0:X02},", value);
+ CSResult.Write ("0x{0:X02},", value);
+ CResult.Write ("{0},", value);
#if Binary
binary.Write (value);
#endif
- if ((i & 0xF) == 0xF)
- Result.WriteLine ("// {0:X04}", i - 0xF);
+ if ((i & 0xF) == 0xF) {
+ CSResult.WriteLine ("// {0:X04}",
+ UUtil.Level1.ToCodePoint (i - 0xF));
+ CResult.WriteLine ();
+ }
}
- Result.WriteLine ("};");
- Result.WriteLine ();
+ CResult.WriteLine ("0};");
+ CSResult.WriteLine ("};");
+ CSResult.WriteLine ();
// Secondary weight
- Result.WriteLine ("internal static readonly byte [] level2 = new byte [] {");
+ CResult.WriteLine ("static const guint8 collation_table_level2 [] = {");
+ CSResult.WriteLine ("static readonly byte [] level2Arr = new byte [] {");
#if Binary
binary.Write (level2.Length);
#endif
for (int i = 0; i < level2.Length; i++) {
byte value = level2 [i];
if (value < 10)
- Result.Write ("{0},", value);
+ CSResult.Write ("{0},", value);
else
- Result.Write ("0x{0:X02},", value);
+ CSResult.Write ("0x{0:X02},", value);
+ CResult.Write ("{0},", value);
#if Binary
binary.Write (value);
#endif
- if ((i & 0xF) == 0xF)
- Result.WriteLine ("// {0:X04}", i - 0xF);
+ if ((i & 0xF) == 0xF) {
+ CSResult.WriteLine ("// {0:X04}",
+ UUtil.Level2.ToCodePoint (i - 0xF));
+ CResult.WriteLine ();
+ }
}
- Result.WriteLine ("};");
- Result.WriteLine ();
+ CResult.WriteLine ("0};");
+ CSResult.WriteLine ("};");
+ CSResult.WriteLine ();
// Tertiary weight
- Result.WriteLine ("internal static readonly byte [] level3 = new byte [] {");
+ CResult.WriteLine ("static const guint8 collation_table_level3 [] = {");
+ CSResult.WriteLine ("static readonly byte [] level3Arr = new byte [] {");
#if Binary
binary.Write (level3.Length);
#endif
for (int i = 0; i < level3.Length; i++) {
byte value = level3 [i];
if (value < 10)
- Result.Write ("{0},", value);
+ CSResult.Write ("{0},", value);
else
- Result.Write ("0x{0:X02},", value);
+ CSResult.Write ("0x{0:X02},", value);
+ CResult.Write ("{0},", value);
#if Binary
binary.Write (value);
#endif
- if ((i & 0xF) == 0xF)
- Result.WriteLine ("// {0:X04}", i - 0xF);
+ if ((i & 0xF) == 0xF) {
+ CSResult.WriteLine ("// {0:X04}",
+ UUtil.Level3.ToCodePoint (i - 0xF));
+ CResult.WriteLine ();
+ }
}
- Result.WriteLine ("};");
- Result.WriteLine ();
+ CResult.WriteLine ("0};");
+ CSResult.WriteLine ("};");
+ CSResult.WriteLine ();
+/*
// Width insensitivity mappings
// (for now it is more lightweight than dumping the
// entire NFKD table).
- Result.WriteLine ("internal static readonly ushort [] widthCompat = new ushort [] {");
+ CResult.WriteLine ("static const guint16* widthCompat [] = {");
+ CSResult.WriteLine ("static readonly ushort [] widthCompatArr = new ushort [] {");
#if Binary
binary.Write (widthCompat.Length);
#endif
for (int i = 0; i < widthCompat.Length; i++) {
ushort value = widthCompat [i];
if (value < 10)
- Result.Write ("{0},", value);
+ CSResult.Write ("{0},", value);
else
- Result.Write ("0x{0:X02},", value);
+ CSResult.Write ("0x{0:X02},", value);
+ CResult.Write ("{0},", value);
#if Binary
binary.Write (value);
#endif
- if ((i & 0xF) == 0xF)
- Result.WriteLine ("// {0:X04}", i - 0xF);
+ if ((i & 0xF) == 0xF) {
+ CSResult.WriteLine ("// {0:X04}",
+ UUtil.WidthCompat.ToCodePoint (i - 0xF));
+ CResult.WriteLine ();
+ }
}
- Result.WriteLine ("};");
- Result.WriteLine ();
+ CResult.WriteLine ("0};");
+ CSResult.WriteLine ("};");
+ CSResult.WriteLine ();
+*/
+
#if Binary
- using (FileStream fs = File.Create ("../collation.core.bin")) {
+ using (FileStream fs = File.Create ("../resources/collation.core.bin")) {
byte [] array = ms.ToArray ();
fs.Write (array, 0, array.Length);
}
SerializeCJK ("cjkKOlv2", cjkKOlv2, 0x9FB0);
}
- void SerializeCJK (string name, ushort [] cjk, int max)
+ void SerializeCJK (string name, ushort [] cjk, int max_unused)
{
- int offset = 0;//char.MaxValue - cjk.Length;
- Result.WriteLine ("static ushort [] {0} = new ushort [] {{", name);
+// CResult.WriteLine ("static const int collation_table_collation_cjk_{0}_size [] = {1};", name, cjk.Length);
+ CSResult.WriteLine ("const int {0}ArrLength = {1};", name, cjk.Length);
+
+ int len = cjk.Length;
+ CResult.WriteLine ("static const guint8 collation_table_collation_cjk_{0} [] = {{", name);
+ CSResult.WriteLine ("static byte [] {0}Arr = new byte [] {{", name);
+ // the actual length is *2
+ for (int i = 0; i < 4; i++, len /= 256) {
+ CResult.Write ("{0},", len & 0xFF);
+ CSResult.Write ("0x{0:X04},", len & 0xFF);
+ }
+ CResult.WriteLine ();
+ CSResult.WriteLine ();
#if Binary
MemoryStream ms = new MemoryStream ();
BinaryWriter binary = new BinaryWriter (ms);
- binary.Write (cjk.Length);
+ binary.Write (UUtil.ResourceVersion);
+ binary.Write (cjk.Length); // the actual size is *2.
#endif
+ // category
for (int i = 0; i < cjk.Length; i++) {
- if (i + offset == max)
- break;
- ushort value = cjk [i];
+// if (i == max)
+// break;
+ byte value = (byte) (cjk [i] >> 8);
+ if (value < 10)
+ CSResult.Write ("{0},", value);
+ else
+ CSResult.Write ("0x{0:X02},", value);
+ CResult.Write ("{0},", value);
+#if Binary
+ binary.Write (value);
+#endif
+ if ((i & 0xF) == 0xF) {
+ CSResult.WriteLine ("// {0:X04}", i - 0xF);
+ CResult.WriteLine ();
+ }
+ }
+
+ // level 1
+ for (int i = 0; i < cjk.Length; i++) {
+// if (i == max)
+// break;
+ byte value = (byte) (cjk [i] & 0xFF);
if (value < 10)
- Result.Write ("{0},", value);
+ CSResult.Write ("{0},", value);
else
- Result.Write ("0x{0:X04},", value);
+ CSResult.Write ("0x{0:X02},", value);
+ CResult.Write ("{0},", value);
#if Binary
binary.Write (value);
#endif
- if ((i & 0xF) == 0xF)
- Result.WriteLine ("// {0:X04}", i - 0xF + offset);
+ if ((i & 0xF) == 0xF) {
+ CSResult.WriteLine ("// {0:X04}", i - 0xF);
+ CResult.WriteLine ();
+ }
}
- Result.WriteLine ("};");
- Result.WriteLine ();
+
+ CResult.WriteLine ("0};");
+ CSResult.WriteLine ("};");
+ CSResult.WriteLine ();
#if Binary
- using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) {
+ using (FileStream fs = File.Create (String.Format ("../resources/collation.{0}.bin", name))) {
byte [] array = ms.ToArray ();
fs.Write (array, 0, array.Length);
}
void SerializeCJK (string name, byte [] cjk, int max)
{
- int offset = 0;//char.MaxValue - cjk.Length;
- Result.WriteLine ("static byte [] {0} = new byte [] {{", name);
+ CResult.WriteLine ("static const guint8 collation_table_collation_cjk_{0} [] = {{", name);
+ CSResult.WriteLine ("static byte [] {0}Arr = new byte [] {{", name);
#if Binary
MemoryStream ms = new MemoryStream ();
BinaryWriter binary = new BinaryWriter (ms);
+ binary.Write (UUtil.ResourceVersion);
#endif
for (int i = 0; i < cjk.Length; i++) {
- if (i + offset == max)
+ if (i == max)
break;
byte value = cjk [i];
if (value < 10)
- Result.Write ("{0},", value);
+ CSResult.Write ("{0},", value);
else
- Result.Write ("0x{0:X02},", value);
+ CSResult.Write ("0x{0:X02},", value);
+ CResult.Write ("{0},", value);
#if Binary
binary.Write (value);
#endif
- if ((i & 0xF) == 0xF)
- Result.WriteLine ("// {0:X04}", i - 0xF + offset);
+ if ((i & 0xF) == 0xF) {
+ CSResult.WriteLine ("// {0:X04}", i - 0xF);
+ CResult.WriteLine ();
+ }
}
- Result.WriteLine ("};");
- Result.WriteLine ();
+ CResult.WriteLine ("0};");
+ CSResult.WriteLine ("};");
+ CSResult.WriteLine ();
#if Binary
- using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) {
+ using (FileStream fs = File.Create (String.Format ("../resources/collation.{0}.bin", name))) {
byte [] array = ms.ToArray ();
fs.Write (array, 0, array.Length);
}
{
Hashtable indexes = new Hashtable ();
Hashtable counts = new Hashtable ();
- Result.WriteLine ("static char [] tailorings = new char [] {");
+ CResult.WriteLine ("static const guint16 collation_table_tailoring [] = {");
+ CSResult.WriteLine ("static char [] tailoringArr = new char [] {");
int count = 0;
#if Binary
MemoryStream ms = new MemoryStream ();
BinaryWriter binary = new BinaryWriter (ms);
+ // Here we don't need to output resource version.
+ // This is cached.
#endif
foreach (Tailoring t in tailorings) {
if (t.Alias != 0)
continue;
- Result.Write ("/*{0}*/", t.LCID);
+ CResult.Write ("/*{0}*/", t.LCID);
+ CSResult.Write ("/*{0}*/", t.LCID);
indexes.Add (t.LCID, count);
char [] values = t.ItemToCharArray ();
counts.Add (t.LCID, values.Length);
foreach (char c in values) {
- Result.Write ("'\\x{0:X}', ", (int) c);
- if (++count % 16 == 0)
- Result.WriteLine (" // {0:X04}", count - 16);
+ CSResult.Write ("'\\x{0:X}', ", (int) c);
+ CResult.Write ("{0},", (int) c);
+ if (++count % 16 == 0) {
+ CSResult.WriteLine (" // {0:X04}", count - 16);
+ CResult.WriteLine ();
+ }
#if Binary
binary.Write ((ushort) c);
#endif
}
}
- Result.WriteLine ("};");
+ CResult.WriteLine ("0};");
+ CSResult.WriteLine ("};");
- Result.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {");
+ CResult.WriteLine ("static const guint32 collation_table_tailoring_infos [] = {");
+ CResult.WriteLine ("{0}, /*count*/", tailorings.Count);
+ CSResult.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {");
#if Binary
byte [] rawdata = ms.ToArray ();
ms = new MemoryStream ();
binary = new BinaryWriter (ms);
+ binary.Write (UUtil.ResourceVersion);
binary.Write (tailorings.Count);
#endif
foreach (Tailoring t in tailorings) {
foreach (Tailoring t2 in tailorings)
if (t2.LCID == t.LCID)
french = t2.FrenchSort;
- Result.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false");
+ CSResult.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false");
+ CResult.WriteLine ("{0},{1},{2},{3},", t.LCID, idx, cnt, french ? 1 : 0);
#if Binary
binary.Write (t.LCID);
binary.Write (idx);
binary.Write (french);
#endif
}
- Result.WriteLine ("};");
+ CResult.WriteLine ("0};");
+ CSResult.WriteLine ("};");
#if Binary
binary.Write ((byte) 0xFF);
binary.Write ((byte) 0xFF);
binary.Write (rawdata, 0, rawdata.Length);
- using (FileStream fs = File.Create ("../collation.tailoring.bin")) {
+ using (FileStream fs = File.Create ("../resources/collation.tailoring.bin")) {
byte [] array = ms.ToArray ();
fs.Write (array, 0, array.Length);
}
{
StringBuilder sb = new StringBuilder ();
for (int i = 0; i < s.Length; i++) {
- if (s.StartsWith ("\\u")) {
- sb.Append ((char) int.Parse (
- s.Substring (2, 4), NumberStyles.HexNumber),
+ if (i + 5 < s.Length &&
+ s [i] == '\\' && s [i + 1] == 'u') {
+ sb.Append (
+ (char) int.Parse (
+ s.Substring (i + 2, 4),
+ NumberStyles.HexNumber),
1);
i += 5;
}
- else
- sb.Append (s [i]);
+ else
+ sb.Append (s [i]);
}
return sb.ToString ();
}
target = 'B';
else if (s.Substring (offset).StartsWith ("OPEN O"))
target = 'C';
+ else if (s.Substring (offset).StartsWith ("ETH"))
+ target = 'D';
else if (s.Substring (offset).StartsWith ("SCHWA"))
target = 'E';
- else if (s.Substring (offset).StartsWith ("ENG"))
- target = 'N';
else if (s.Substring (offset).StartsWith ("OI;")) // 01A2,01A3
target = 'O';
else if (s.Substring (offset).StartsWith ("YR;")) // 01A2,01A3
target = 'S';
else if (s.Substring (offset).StartsWith ("ESH"))
target = 'S';
+ else if (s.Substring (offset).StartsWith ("OUNCE"))
+ target = 'Z';
// For remaining IPA chars, direct mapping is
// much faster.
switch (cp) {
+ case 0x0166: case 0x0167:
+ // Though they are 'T', they have different weight
+ target = char.MinValue; break;
case 0x0299: target = 'B'; break;
case 0x029A: target = 'E'; break;
case 0x029B: target = 'G'; break;
"SOUTH WEST",
"LEFTWARDS",
"NORTH WEST",
+ "LEFT RIGHT",
+ "UP DOWN",
};
+ if (s.IndexOf ("RIGHTWARDS") >= 0 &&
+ s.IndexOf ("LEFTWARDS") >= 0)
+ value = 0xE1 - 0xD8;
+ else if (s.IndexOf ("UPWARDS") >= 0 &&
+ s.IndexOf ("DOWNWARDS") >= 0)
+ value = 0xE2 - 0xD8;
+ else if (s.IndexOf ("ARROW") >= 0 &&
+ s.IndexOf ("COMBINING") < 0 &&
+ s.IndexOf ("CLOCKWISE") >= 0)
+ value = s.IndexOf ("ANTICLOCKWISE") >= 0 ? 0xE4 - 0xD8 : 0xE3 - 0xD8;
if (value == 0)
for (int i = 1; value == 0 && i < arrowTargets.Length; i++)
if (s.IndexOf (arrowTargets [i]) > 0 &&
// Box names
if (0x2500 <= cp && cp < 0x2600) {
- int value = 0;
+ int value = int.MinValue;
// flags:
// up:1 down:2 right:4 left:8 vert:16 horiz:32
// [h,rl] [r] [l]
flag |= 32;
int fidx = flags.IndexOf (flag);
- value = fidx < 0 ? fidx : offsets [fidx];
+ if (fidx >= 0)
+ value = offsets [fidx];
} else if (s.IndexOf ("BLOCK") >= 0) {
if (s.IndexOf ("ONE EIGHTH") >= 0)
value = 0x12;
else
value = 0xC9 - 0xE5;
}
+ else if (s.IndexOf ("BULLET") >= 0)
+ value = 0xCC - 0xE5;
if (0x25DA <= cp && cp <= 0x25E5)
value = 0xCD + cp - 0x25DA - 0xE5;
case 0x2572: value = 0x10; break;
case 0x2573: value = 0x11; break;
}
- if (value != 0)
+ if (value != int.MinValue)
boxValues.Add (new DictionaryEntry (
cp, value));
}
// diacritical weights by character name
if (diacritics.Length != diacriticWeights.Length)
throw new Exception (String.Format ("Should not happen. weights are {0} while labels are {1}", diacriticWeights.Length, diacritics.Length));
- for (int d = 0; d < diacritics.Length; d++) {
+ for (int d = diacritics.Length - 1; d >= 0; d--) {
if (s.IndexOf (diacritics [d]) > 0) {
diacritical [cp] += diacriticWeights [d];
if (s.IndexOf ("COMBINING") >= 0)
diacritical [cp] -= (byte) 2;
- continue;
+ break;
}
// also process "COMBINING blah" here
// For now it is limited to cp < 0x0370
if (tmp.IndexOf ("WITH ") == 0)
tmp = tmp.Substring (4);
tmp = String.Concat ("COMBINING", (tmp [0] != ' ' ? " " : ""), tmp);
- if (name == tmp)
+ if (name == tmp) {
diacritical [cp] = (byte) (diacriticWeights [d] - 2);
+ break;
+ }
//if (name == tmp)
//Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'", name, tmp, cp);
}
if (s.IndexOf ("FULL STOP") > 0 &&
(s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
diacritical [cp] |= 0xF4;
+ if (s.StartsWith ("SCRIPT") || s.IndexOf (" SCRIPT ") > 0)
+ diacritical [cp] = (byte) (s.IndexOf ("SMALL") > 0 ? 3 :
+ s.IndexOf ("CAPITAL") > 0 ? 5 : 4);
// Arabic letter name
if (0x0621 <= cp && cp <= 0x064A &&
void ModifyUnidata ()
{
+ ArrayList decompValues = new ArrayList (this.decompValues);
+
+ // Hebrew uppercase letters.
+ foreach (int i in new int []
+ {0x05DB, 0x05DE, 0x05E0, 0x05E4, 0x05E6})
+ isUppercase [i] = true;
+
+
// Modify some decomposition equivalence
- decompType [0xFE31] = 0;
- decompIndex [0xFE31] = 0;
- decompLength [0xFE31] = 0;
- decompType [0xFE32] = 0;
- decompIndex [0xFE32] = 0;
- decompLength [0xFE32] = 0;
+ for (int i = 0xFE31; i <= 0xFE34; i++) {
+ decompType [i] = 0;
+ decompIndex [i] = 0;
+ decompLength [i] = 0;
+ }
+ decompType [0x037E] = 0;
+ decompIndex [0x037E] = 0;
+ decompLength [0x037E] = 0;
+ // Hangzhou numbers
+ for (int i = 0x3021; i <= 0x3029; i++)
+ diacritical [i] = 0x4E;
// Korean parens numbers
for (int i = 0x3200; i <= 0x321C; i++)
diacritical [i] = 0xA;
decompValues [decompIndex [0x3298]] = 0x52DE;
// LAMESPEC: custom remapping (these are not bugs, but they are not fine either: they are not standard-compliant)
- decompIndex [0xFA0C] = decompIndex [0xF929]; // borrow U+F929 room (being empty)
- decompValues [decompIndex [0xFA0C]] = 0x5140;
+ decompIndex [0xFA0C] = decompValues.Count;
+ decompValues.Add ((int) 0x5140);
decompLength [0xFA0C] = 1;
decompIndex [0xF929] = decompLength [0xF929] = 0;
decompValues [decompIndex [0xF92C]] = 0x90DE;
+
+ decompIndex [0x2125] = decompValues.Count;
+ decompValues.Add ((int) 0x005A);
+ decompLength [0x2125] = 1;
+ decompType [0x2125] = DecompositionFont;
+
+ this.decompValues = decompValues.ToArray (typeof (int)) as int [];
}
void ModifyParsedValues ()
{
+ // Sometimes STROKE doesn't work well
+ diacritical [0xD8] = diacritical [0xF8] = 0x21;
+ diacritical [0x141] = diacritical [0x142] = 0x1F;
+ // FIXME: why?
+ diacritical [0xAA] = diacritical [0xBA] = 3;
+ diacritical [0xD0] = diacritical [0xF0] = 0x68;
+ diacritical [0x131] = 3;
+ diacritical [0x138] = 3;
+ // TOPBAR does not work as an identifier for the weight
+ diacritical [0x182] = diacritical [0x183] = 0x68; // B
+ diacritical [0x18B] = diacritical [0x18C] = 0x1E; // D
+ // TONE TWO
+ diacritical [0x1A7] = diacritical [0x1A8] = 0x87;
+ // TONE SIX
+ diacritical [0x184] = diacritical [0x185] = 0x87;
+ // OPEN E
+ diacritical [0x190] = diacritical [0x25B] = 0x7B;
+ // There are many letters w/ diacritical weight 0x7B
+ diacritical [0x0192] = diacritical [0x0194] =
+ diacritical [0x0195] = diacritical [0x0196] =
+ diacritical [0x019C] = diacritical [0x019E] =
+ diacritical [0x01A6] = diacritical [0x01B1] =
+ diacritical [0x01B2] = diacritical [0x01BF] = 0x7B;
+ // ... as well as 0x7C
+ diacritical [0x01A2] = diacritical [0x01A3] = 0x7C;
+
+ // <font> NFKD characters seem to have diacritical
+ // weight as 3,4,5... but the order does not look
+ // by codepoint and I have no idea how they are sorted.
+ diacritical [0x210E] = 3;
+ diacritical [0x210F] = 0x68;
+ diacritical [0x2110] = 4;
+ diacritical [0x2111] = 5;
+ diacritical [0x2112] = 4;
+ diacritical [0x2113] = 4;
+ diacritical [0x211B] = 4;
+ diacritical [0x211C] = 5;
+
+ // some cyrillic diacritical weight. They seem to be
+ // based on old character names, so it's quicker to
+ // set them directly here.
+ // FIXME: these values are set for mostly unknown reasons
+ diacritical [0x0496] = diacritical [0x0497] = 7;
+ diacritical [0x0498] = diacritical [0x0499] = 0x1A;
+ diacritical [0x049A] = diacritical [0x049B] = 0x17;
+ diacritical [0x049C] = diacritical [0x049D] = 9;
+ diacritical [0x049E] = diacritical [0x049F] = 4;
+ diacritical [0x04A0] = diacritical [0x04A1] = 0xA;
+ diacritical [0x04A2] = diacritical [0x04A3] = 7;
+ diacritical [0x04A4] = diacritical [0x04A5] = 8;
+ diacritical [0x04AA] = diacritical [0x04AB] = 0x1A; // ES CEDILLA?
+ diacritical [0x04AC] = diacritical [0x04AD] = 7; // RIGHT DESCENDER? but U+4B2
+ diacritical [0x04AE] = diacritical [0x04AF] = 0xB; // STRAIGHT U?
+ diacritical [0x04B2] = diacritical [0x04B3] = 0x17; // RIGHT DESCENDER? but U+4AC
+ diacritical [0x04B4] = diacritical [0x04B5] = 3;
+ diacritical [0x04B6] = 8;
+ diacritical [0x04B7] = 7;
+ diacritical [0x04B8] = diacritical [0x04B9] = 9;
+ diacritical [0x04BA] = diacritical [0x04BB] = 9;
+
// number, secondary weights
byte weight = 0x38;
int [] numarr = numberSecondaryWeightBounds;
if (Char.IsNumber ((char) cp))
diacritical [cp] = weight;
+ // Gurmukhi special letters' diacritical weight
+ for (int i = 0x0A50; i < 0x0A60; i++)
+ diacritical [i] = 4;
+ // Oriya special letters' diacritical weight
+ for (int i = 0x0B5C; i < 0x0B60; i++)
+ diacritical [i] = 6;
+
// Update name part of named characters
for (int i = 0; i < sortableCharNames.Count; i++) {
DictionaryEntry de =
#region Specially ignored // 01
// This will raise "Defined" flag up.
+ // FIXME: Check whether this is really fine. Actually, for
+ // Japanese voice marks this code does remapping.
foreach (char c in specialIgnore)
map [(int) c] = new CharMapEntry (0, 0, 0);
#endregion
+ #region Extenders (FF FF)
+ fillIndex [0xFF] = 0xFF;
+ char [] specialBiggest = new char [] {
+ '\u3005', '\u3031', '\u3032', '\u309D',
+ '\u309E', '\u30FC', '\u30FD', '\u30FE',
+ '\uFE7C', '\uFE7D', '\uFF70'};
+ foreach (char c in specialBiggest)
+ AddCharMap (c, 0xFF, 0);
+ #endregion
#region Variable weights
// Controls : 06 03 - 06 3D
- fillIndex [6] = 3;
+ fillIndex [0x6] = 3;
for (int i = 0; i < 65536; i++) {
if (IsIgnorable (i))
continue;
}
// Apostrophe 06 80
- fillIndex [6] = 0x80;
- AddCharMapGroup ('\'', 6, 1, 0);
+ fillIndex [0x6] = 0x80;
+ AddCharMap ('\'', 6, 0);
+ AddCharMap ('\uFF07', 6, 1);
AddCharMap ('\uFE63', 6, 1);
+ // SPECIAL CASE: fill FE32 here, before it would be added
+ // at 2013. Windows does not always respect NFKD.
+ map [0xFE32] = new CharMapEntry (6, 0x90, 0);
+
// Hyphen/Dash : 06 81 - 06 90
for (int i = 0; i < char.MaxValue; i++) {
if (!IsIgnorable (i) &&
}
}
}
+ // They are regarded as primarily equivalent to '-'
+ map [0x208B] = new CharMapEntry (6, 0x82, 0);
+ map [0x207B] = new CharMapEntry (6, 0x82, 0);
+ map [0xFF0D] = new CharMapEntry (6, 0x82, 0);
// Arabic variable weight chars 06 A0 -
fillIndex [6] = 0xA0;
// vowels
for (int i = 0x64B; i <= 0x650; i++)
- AddArabicCharMap ((char) i);
+ AddArabicCharMap ((char) i, 6, 1, 0);
// sukun
AddCharMapGroup ('\u0652', 6, 1, 0);
// shadda
for (int i = 0x0329; i <= 0x0334; i++)
if (!IsIgnorable (i))
AddCharMap ((char) i, 0x1, 1);
+ fillIndex [0x1]++;
for (int i = 0x0339; i <= 0x0341; i++)
if (!IsIgnorable (i))
AddCharMap ((char) i, 0x1, 1);
- fillIndex [0x1] = 0x72;
+ fillIndex [0x1] = 0x74;
for (int i = 0x0346; i <= 0x0348; i++)
if (!IsIgnorable (i))
AddCharMap ((char) i, 0x1, 1);
for (int i = 0x02CE; i <= 0x02CF; i++)
if (!IsIgnorable (i))
AddCharMap ((char) i, 0x1, 1);
+ fillIndex [0x1]++;
for (int i = 0x02D1; i <= 0x02D3; i++)
if (!IsIgnorable (i))
AddCharMap ((char) i, 0x1, 1);
if (!IsIgnorable (i))
AddCharMap ((char) i, 0x1, 1);
+
// FIXME: needs more love here (it should eliminate
// all the hacky code above).
for (int i = 0x0300; i < 0x0370; i++)
if (!IsIgnorable (i) && diacritical [i] != 0
- /* especiall here*/ && !map [i].Defined)
+ && !map [i].Defined)
map [i] = new CharMapEntry (
0x1, 0x1, diacritical [i]);
// a few more characters (that however results in
// overflow of level 2 unless we start before 0xDD).
fillIndex [0x1] = 0xDD;
- for (int i = 0x20d0; i <= 0x20e1; i++)
+ for (int i = 0x20D0; i <= 0x20DC; i++)
+ AddCharMap ((char) i, 0x1, 1);
+ fillIndex [0x1] = 0xEC;
+ for (int i = 0x20DD; i <= 0x20E1; i++)
+ AddCharMap ((char) i, 0x1, 1);
+ fillIndex [0x1] = 0x4;
+ AddCharMap ('\u0CD5', 0x1, 1);
+ AddCharMap ('\u0CD6', 0x1, 1);
+ AddCharMap ('\u093C', 0x1, 1);
+ for (int i = 0x302A; i <= 0x302D; i++)
+ AddCharMap ((char) i, 0x1, 1);
+ AddCharMap ('\u0C55', 0x1, 1);
+ AddCharMap ('\u0C56', 0x1, 1);
+
+ fillIndex [0x1] = 0x50; // I wonder how they are sorted
+ for (int i = 0x02D4; i <= 0x02D7; i++)
AddCharMap ((char) i, 0x1, 1);
// They are not part of Nonspacing marks, but have
for (int i = 0x30FC; i <= 0x30FE; i++)
map [i] = new CharMapEntry (0xFF, 0xFF, 1);
+ fillIndex [0x1] = 0xA;
+ for (int i = 0x0951; i <= 0x0954; i++)
+ AddCharMap ((char) i, 0x1, 2);
+
#endregion
// while they aren't.
AddCharMap ('\u2422', 0x7, 1, 0); // blank symbol
AddCharMap ('\u2423', 0x7, 1, 0); // open box
+
#endregion
// category 09 - continued symbols from 08
AddCharMap ((char) cp, 0x9, 1, 0);
// arrows
- byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3};
+ byte [] arrowLv2 = new byte [] {0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3};
foreach (DictionaryEntry de in arrowValues) {
int idx = (int) de.Value;
int cp = (int) de.Key;
}
// boxes
byte [] boxLv2 = new byte [128];
+ // Slots 0-63 are used for offsets that are positive,
+ // and slots 64-127 are used for negative ones.
for (int i = 0; i < boxLv2.Length; i++)
boxLv2 [i] = 3;
foreach (DictionaryEntry de in boxValues) {
continue;
if (off < 0) {
fillIndex [0x9] = (byte) (0xE5 + off);
- AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [-off]++);
+ AddCharMapGroup ((char) cp, 0x9, 0, boxLv2 [128 + off]++);
}
else {
fillIndex [0x9] = (byte) (0xE5 + off);
uc = Char.GetUnicodeCategory ((char) cp);
if (!IsIgnorable (cp) &&
uc == UnicodeCategory.CurrencySymbol &&
- cp != '$' ||
- cp == 0xAC)
+ cp != '$')
AddCharMapGroup ((char) cp, 0xA, 1, 0);
}
// byte other symbols
uc = Char.GetUnicodeCategory ((char) cp);
if (!IsIgnorable (cp) &&
uc == UnicodeCategory.OtherSymbol ||
- cp == '\u00B5' || cp == '\u00B7')
+ cp == '\u00AC' || cp == '\u00B5' || cp == '\u00B7')
AddCharMapGroup ((char) cp, 0xA, 1, 0);
}
+ // U+30FB here
+ AddCharMapGroup ('\u30FB', 0xA, 1, 0);
- fillIndex [0xA] = 0x0F; // FIXME: it won't be needed
for (int cp = 0x2020; cp <= 0x2031; cp++)
if (Char.IsPunctuation ((char) cp))
AddCharMap ((char) cp, 0xA, 1, 0);
for (int cp = 0x20A0; cp <= 0x20AB; cp++)
AddCharMap ((char) cp, 0xA, 1, 0);
- fillIndex [0xA] = 0x2F; // FIXME: it won't be needed
+
+ // 3004 is skipped at first...
+ for (int cp = 0x3010; cp <= 0x3040; cp++)
+ if (Char.IsSymbol ((char) cp))
+ AddCharMap ((char) cp, 0xA, 1, 0);
+ // SPECIAL CASES: added here
+ AddCharMap ('\u3004', 0xA, 1, 0);
+ AddCharMap ('\u327F', 0xA, 1, 0);
+
for (int cp = 0x2600; cp <= 0x2613; cp++)
AddCharMap ((char) cp, 0xA, 1, 0);
// Dingbats
for (int i = 0x2440; i < 0x2460; i++)
AddCharMap ((char) i, 0xA, 1, 0);
+ // SPECIAL CASES: why?
+ AddCharMap ('\u0E3F', 0xA, 1, 0);
+ AddCharMap ('\u2117', 0xA, 1, 0);
+ AddCharMap ('\u20AC', 0xA, 1, 0);
#endregion
#region Numbers // 0C 02 - 0C E1
fillIndex [0xC] = 2;
// 9F8 : Bengali "one less than the denominator"
- AddCharMap ('\u09F8', 0xC, 1);
+ AddCharMap ('\u09F8', 0xC, 1, 0x3C);
ArrayList numbers = new ArrayList ();
for (int i = 0; i < 65536; i++)
ArrayList numberValues = new ArrayList ();
foreach (int i in numbers)
numberValues.Add (new DictionaryEntry (i, decimalValue [(char) i]));
+ // SPECIAL CASE: Cyrillic Thousand sign
+ numberValues.Add (new DictionaryEntry (0x0482, 1000m));
numberValues.Sort (DecimalDictionaryValueComparer.Instance);
//foreach (DictionaryEntry de in numberValues)
//Console.Error.WriteLine ("****** number {0:X04} : {1} {2}", de.Key, de.Value, decompType [(int) de.Key]);
+ // FIXME: fillIndex adjustment lines are too
+ // complicated. It must be simpler.
decimal prevValue = -1;
foreach (DictionaryEntry de in numberValues) {
int cp = (int) de.Key;
fillIndex [0xC]++;
int xcp;
- if (currValue <= 10) {
- xcp = (int) prevValue + 0x2170 - 1;
- AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
+ if (currValue <= 13) {
+ if (currValue == 4)
+ fillIndex [0xC]++;
+ // SPECIAL CASE
+ if (currValue == 11)
+ AddCharMap ('\u0BF0', 0xC, 1);
xcp = (int) prevValue + 0x2160 - 1;
AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
- fillIndex [0xC] += 2;
- xcp = (int) prevValue + 0x3021 - 1;
+ xcp = (int) prevValue + 0x2170 - 1;
AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
fillIndex [0xC]++;
}
- else if (currValue == 11)
+ if (currValue < 12)
+ fillIndex [0xC]++;
+ if (currValue <= 10) {
+ xcp = (int) prevValue + 0x3021 - 1;
+ AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
fillIndex [0xC]++;
+ }
}
if (prevValue < currValue)
prevValue = currValue;
continue;
// HangZhou and Roman are add later
// (code is above)
- else if (0x3021 <= cp && cp < 0x302A
- || 0x2160 <= cp && cp < 0x216A
- || 0x2170 <= cp && cp < 0x217A)
+ if (0x3021 <= cp && cp < 0x302A
+ || 0x2160 <= cp && cp < 0x216C
+ || 0x2170 <= cp && cp < 0x217C)
continue;
- if (cp == 0x215B) // FIXME: why?
+ if (cp == 0x215B) // FIXME: why?
fillIndex [0xC] += 2;
else if (cp == 0x3021) // FIXME: why?
fillIndex [0xC]++;
- AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp]);
if (addnew || cp <= '9') {
int mod = (int) currValue - 1;
int xcp;
- if (1 <= currValue && currValue <= 10) {
+ if (1 <= currValue && currValue <= 11) {
xcp = mod + 0x2776;
AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
xcp = mod + 0x2780;
AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
}
}
+ if (addnew && currValue >= 10 && currValue < 13 || cp == 0x09F9)
+ fillIndex [0xC]++;
+ AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp], true);
- if (cp != 0x09E7 && cp != 0x09EA)
+ switch (cp) {
+ // Maybe Bengali digit numbers do not increase
+ // indexes, but 0x09E6 does.
+ case 0x09E7: case 0x09E8: case 0x09E9:
+ case 0x09EA:
+ // SPECIAL CASES
+ case 0x0BF0: case 0x2180: case 0x2181:
+ break;
+ // SPECIAL CASE
+ case 0x0BF1:
fillIndex [0xC]++;
+ break;
+ default:
+ if (currValue < 11 || currValue == 1000)
+ fillIndex [0xC]++;
+ break;
+ }
// Add special cases that are not regarded as
// numbers in UnicodeCategory speak.
AddCharMapGroup ('\u01BD', 0xC, 0, 0);
AddCharMapGroup ('\u01BC', 0xC, 1, 0);
}
- else if (cp == '6') // FIXME: why?
+ else if (cp == '2' || cp == '6') // FIXME: why?
fillIndex [0xC]++;
}
AddCharMapGroup ((char) i, 0xE, 1, 0);
}
- // Greek and Coptic
- fillIndex [0xF] = 02;
- for (int i = 0x0380; i < 0x0390; i++)
+ // IPA extensions
+ // FIXME: this yields values that are not equivalent to
+ // those of Windows, but is safer for comparison.
+ char [] ipaArray = new char [0x300 - 0x250 + 0x20];
+ for (int i = 0x40; i < 0x60; i++)
if (Char.IsLetter ((char) i))
- AddLetterMap ((char) i, 0xF, 1);
- fillIndex [0xF] = 02;
- for (int i = 0x0391; i < 0x03CF; i++)
+ ipaArray [i - 0x40] = (char) (i);
+ for (int i = 0x250; i < 0x300; i++)
if (Char.IsLetter ((char) i))
- AddLetterMap ((char) i, 0xF, 1);
+ ipaArray [i - 0x250 + 0x20] = (char) i;
+ Array.Sort (ipaArray, UCAComparer.Instance);
+ int targetASCII = 0;
+ byte latinDiacritical = 0x7B;
+ foreach (char c in ipaArray) {
+ if (c <= 'Z') {
+ targetASCII = c;
+ latinDiacritical = 0x7B;
+ }
+ else
+ map [(int) c] = new CharMapEntry (
+ 0xE,
+ map [targetASCII].Level1,
+ latinDiacritical++);
+ }
+
+ // Greek and Coptic
+
+ // FIXME: this is (mysterious and) incomplete.
+ for (int i = 0x0380; i < 0x0400; i++)
+ if (diacritical [i] == 0 &&
+ decompLength [i] == 1 &&
+ decompType [i] == DecompositionCompat)
+ diacritical [i] = 3;
+
+ fillIndex [0xF] = 2;
+ for (int i = 0x0391; i < 0x03AA; i++)
+ if (i != 0x03A2)
+ AddCharMap ((char) i, 0xF, 1,
+ diacritical [i]);
+ fillIndex [0xF] = 2;
+ for (int i = 0x03B1; i < 0x03CA; i++)
+ if (i != 0x03C2)
+ AddCharMap ((char) i, 0xF, 1,
+ diacritical [i]);
+ // Final Sigma
+ map [0x03C2] = new CharMapEntry (0xF,
+ map [0x03C3].Level1, map [0x03C3].Level2);
+
fillIndex [0xF] = 0x40;
- for (int i = 0x03D0; i < 0x0400; i++)
- if (Char.IsLetter ((char) i))
- AddLetterMap ((char) i, 0xF, 1);
+ for (int i = 0x03DA; i < 0x03F0; i++)
+ AddCharMap ((char) i, 0xF,
+ (byte) (i % 2 == 0 ? 0 : 2),
+ diacritical [i]);
+
+ // NFKD
+ for (int i = 0x0386; i <= 0x0400; i++)
+ FillLetterNFKD (i, true, true);
// Cyrillic.
// Cyrillic letters are sorted like Latin letters i.e.
}
}
+ // NFKD
+ for (int i = 0x0401; i <= 0x045F; i++)
+ FillLetterNFKD (i, false, false);
+
for (int i = 0; i < cymap_src.Length; i++) {
char c = cymap_src [i];
fillIndex [0x10] = map [c].Level1;
- AddLetterMap ((char) (0x0490 + i * 2),
- 0x10, 0);
+ int c2 = 0x0490 + i * 2;
+ AddLetterMapCore ((char) c2, 0x10, 0, diacritical [c2], false);
}
// Armenian
fillIndex [0x11] = 0x3;
- for (int i = 0x0531; i < 0x0586; i++)
+ fillIndex [0x1] = 0x98;
+ for (int i = 0x0531; i < 0x0586; i++) {
+ if (i == 0x0559 || i == 0x55A)
+ AddCharMap ((char) i, 1, 1);
if (Char.IsLetter ((char) i))
AddLetterMap ((char) i, 0x11, 1);
+ }
// Hebrew
// -Letters
fillIndex [0x12] = 0x2;
for (int i = 0x05D0; i < 0x05FF; i++)
- if (Char.IsLetter ((char) i))
- AddLetterMap ((char) i, 0x12, 1);
+ if (Char.IsLetter ((char) i)) {
+ if (isUppercase [i]) {
+ fillIndex [0x12]--;
+ AddLetterMap ((char) i, 0x12, 2);
+ }
+ else
+ AddLetterMap ((char) i, 0x12, 1);
+ }
// -Accents
fillIndex [0x1] = 0x3;
for (int i = 0x0591; i <= 0x05C2; i++) {
case 0x0649: formDiacritical = 5; break;
case 0x064A: formDiacritical = 7; break;
}
- AddLetterMapCore ((char) i, 0x13, 1, formDiacritical);
+// AddLetterMapCore ((char) i, 0x13, 1, formDiacritical, false);
+ AddArabicCharMap ((char) i, 0x13, 1, formDiacritical);
}
+ for (int i = 0x0670; i < 0x0673; i++)
+ map [i] = new CharMapEntry (0x13, 0xB, (byte) (0xC + i - 0x670));
fillIndex [0x13] = 0x84;
for (int i = 0x0674; i < 0x06D6; i++)
if (Char.IsLetter ((char) i))
- AddLetterMap ((char) i, 0x13, 1);
+ AddLetterMapCore ((char) i, 0x13, 1, 0, false);
// Devanagari
+
+ // FIXME: this could be fixed in more decent way
+ for (int i = 0x0958; i <= 0x095F; i++)
+ diacritical [i] = 8;
+
// FIXME: it does seem straight codepoint mapping.
fillIndex [0x14] = 04;
for (int i = 0x0901; i < 0x0905; i++)
if (c == '\u0A3C' || c == '\u0A4D' ||
'\u0A66' <= c && c <= '\u0A71')
continue;
- // SPECIAL CASE: U+A38 = U+A36 at primary level (why?)
+ // SPECIAL CASES
byte shift = 4;
- if (c == '\u0A36' || c == '\u0A16' || c == '\u0A17' || c == '\u0A5B' || c == '\u0A5E')
+ switch (c) {
+ case '\u0A33': case '\u0A36': case '\u0A16':
+ case '\u0A17': case '\u0A5B': case '\u0A5E':
shift = 0;
+ break;
+ }
+ if (c == '\u0A3E') // Skip
+ fillIndex [0x16] = 0xC0;
AddLetterMap (c, 0x16, shift);
}
AddLetterMap ((char) i, 0x1, 1);
continue;
}
- AddLetterMap ((char) i, 0x18, 1);
+ AddLetterMapCore ((char) i, 0x18, 1, 0, true);
}
// Tamil
+ "[\u11E8 \u11E9],, \u11EA \u317D,, \u110A=\u11BB,,, >"
+ "<{\u1134 \u1140}, \u317E,,,,,,, \u11EB,"
+ "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >"
- + "<{\u1141 \u114C}, \u11EE, \u11EC, \u11ED,,,,, "
+ + "<{\u1141 \u114C}, \u3180=\u11EE, \u11EC, \u11ED,,,,, "
+ "\u11F1,, \u11F2,,,"
- + "\u11EF,,, \u11F0, \u110C=\u11BD,, >"
+ + "\u11EF,,, \u3181=\u11F0, \u110C=\u11BD,, >"
+ "<\u114D, \u110D,, >"
+ "<{\u114E \u1151},, \u110E=\u11BE,, >"
+ "<{\u1152 \u1155},,, \u110F=\u11BF >"
// PrivateUse ... computed.
// remaining Surrogate ... computed.
- #region Special "biggest" area (FF FF)
- fillIndex [0xFF] = 0xFF;
- char [] specialBiggest = new char [] {
- '\u3005', '\u3031', '\u3032', '\u309D',
- '\u309E', '\u30FC', '\u30FD', '\u30FE',
- '\uFE7C', '\uFE7D', '\uFF70'};
- foreach (char c in specialBiggest)
- AddCharMap (c, 0xFF, 0);
- #endregion
-
#region 07 - ASCII non-alphanumeric + 3001, 3002 // 07
// non-alphanumeric ASCII except for: + - < = > '
for (int i = 0x21; i < 0x7F; i++) {
+ // SPECIAL CASE: U+02C6 appears to be treated as
+ // equivalent to '^', which does not conform
+ // to the Unicode Character Database.
+ if (i == 0x005B)
+ AddCharMap ('\u2045', 0x7, 0, 0x1C);
+ if (i == 0x005D)
+ AddCharMap ('\u2046', 0x7, 0, 0x1C);
+ if (i == 0x005E)
+ AddCharMap ('\u02C6', 0x7, 0, 3);
+ if (i == 0x0060)
+ AddCharMap ('\u02CB', 0x7, 0, 3);
+
if (Char.IsLetterOrDigit ((char) i)
|| "+-<=>'".IndexOf ((char) i) >= 0)
continue; // they are not added here.
- AddCharMapGroup2 ((char) i, 0x7, 1, 0);
+
+ AddCharMapGroup2 ((char) i, 0x7, 1, 0);
// Insert 3001 after ',' and 3002 after '.'
if (i == 0x2C)
AddCharMapGroup2 ('\u3001', 0x7, 1, 0);
// FIXME: actually those reset should not be
// done but here I put for easy goal.
+ if (i == 0x05C3)
+ fillIndex [0x7]++;
if (i == 0x0700)
fillIndex [0x7] = 0xE2;
if (i == 0x2016)
fillIndex [0x7] = 0x77;
+ if (i == 0x3008)
+ fillIndex [0x7] = 0x93;
+
+ if (0x02C8 <= i && i <= 0x02CD)
+ continue; // nonspacing marks
+
+ // SPECIAL CASE: maybe they could be allocated
+ // dummy NFKD mapping and no special processing
+ // would be required here.
+ if (i == 0x00AF)
+ AddCharMap ('\u02C9', 0x7, 0, 3);
+ if (i == 0x00B4)
+ AddCharMap ('\u02CA', 0x7, 0, 3);
+ if (i == 0x02C7)
+ AddCharMap ('\u02D8', 0x7, 0, 3);
// SPECIAL CASES:
switch (i) {
case 0xAB: // 08
case 0xB7: // 0A
case 0xBB: // 08
+ case 0x02B9: // 01
+ case 0x02BA: // 01
case 0x2329: // 09
case 0x232A: // 09
continue;
case UnicodeCategory.OtherPunctuation:
case UnicodeCategory.ClosePunctuation:
case UnicodeCategory.OpenPunctuation:
+ case UnicodeCategory.ConnectorPunctuation:
case UnicodeCategory.InitialQuotePunctuation:
case UnicodeCategory.FinalQuotePunctuation:
case UnicodeCategory.ModifierSymbol:
// SPECIAL CASES: // 0xA
if (0x2020 <= i && i <= 0x2031)
continue;
- AddCharMapGroup ((char) i, 0x7, 1, 0);
+ if (i == 0x3003) // added later
+ continue;
+ AddCharMapGroup2 ((char) i, 0x7, 1, 0);
break;
default:
- if (i == 0xA6) // SPECIAL CASE. FIXME: why?
+ if (i == 0xA6 || i == 0x1C3 || i == 0x037A) // SPECIAL CASE. FIXME: why?
goto case UnicodeCategory.OtherPunctuation;
break;
}
}
+
// Control pictures
// FIXME: it should not need to reset level 1, but
// it's for easy goal.
fillIndex [0x7] = 0xB6;
- for (int i = 0x2400; i <= 0x2421; i++)
+ for (int i = 0x2400; i <= 0x2424; i++)
AddCharMap ((char) i, 0x7, 1, 0);
+
+ // FIXME: what are they?
+ AddCharMap ('\u3003', 0x7, 1);
+ AddCharMap ('\u3006', 0x7, 1);
+ AddCharMap ('\u02D0', 0x7, 1);
+ AddCharMap ('\u10FB', 0x7, 1);
+ AddCharMap ('\u0950', 0x7, 1);
+ AddCharMap ('\u093D', 0x7, 1);
+ AddCharMap ('\u0964', 0x7, 1);
+ AddCharMap ('\u0965', 0x7, 1);
+ AddCharMap ('\u0970', 0x7, 1);
+
+ #endregion
+
+ #region category 08 - symbols
+ fillIndex [0x8] = 2;
+ // Here Windows mapping is not straightforward. It is
+ // not based on computation but seems manual sorting.
+ AddCharMapGroup ('+', 0x8, 1, 0); // plus
+ AddCharMapGroup ('\u2212', 0x8, 1); // minus
+ AddCharMapGroup ('\u229D', 0x8, 1); // minus
+ AddCharMapGroup ('\u2297', 0x8, 1); // mul
+ AddCharMapGroup ('\u2044', 0x8, 1); // div
+ AddCharMapGroup ('\u2215', 0x8, 0); // div
+ AddCharMapGroup ('\u2298', 0x8, 1); // div slash
+ AddCharMapGroup ('\u2217', 0x8, 0); // mul
+ AddCharMapGroup ('\u229B', 0x8, 1); // asterisk oper
+ AddCharMapGroup ('\u2218', 0x8, 0); // ring
+ AddCharMapGroup ('\u229A', 0x8, 1); // ring
+ AddCharMapGroup ('\u2219', 0x8, 0); // bullet
+ AddCharMapGroup ('\u2299', 0x8, 1); // dot oper
+ AddCharMapGroup ('\u2213', 0x8, 1); // minus-or-plus
+ AddCharMapGroup ('\u003C', 0x8, 1); // <
+ AddCharMapGroup ('\u227A', 0x8, 1); // precedes relation
+ AddCharMapGroup ('\u22B0', 0x8, 1); // precedes under relation
+
+ for (int cp = 0; cp < 0x2300; cp++) {
+ if (cp == 0xAC) // SPECIAL CASE: skip
+ continue;
+ if (cp == 0x200) {
+ cp = 0x2200; // skip to 2200
+ fillIndex [0x8] = 0x21;
+ }
+ if (cp == 0x2295)
+ fillIndex [0x8] = 0x3;
+ if (cp == 0x22A2)
+ fillIndex [0x8] = 0xAB;
+ if (cp == 0x22B2)
+ fillIndex [0x8] = 0xB9;
+ if (!map [cp].Defined &&
+// Char.GetUnicodeCategory ((char) cp) ==
+// UnicodeCategory.MathSymbol)
+ Char.IsSymbol ((char) cp))
+ AddCharMapGroup ((char) cp, 0x8, 1);
+ // SPECIAL CASES: no idea why Windows sorts as such
+ switch (cp) {
+ case 0x3E:
+ AddCharMap ('\u227B', 0x8, 1, 0);
+ AddCharMap ('\u22B1', 0x8, 1, 0);
+ break;
+ case 0xB1:
+ AddCharMapGroup ('\u00AB', 0x8, 1);
+ AddCharMapGroup ('\u226A', 0x8, 1);
+ AddCharMapGroup ('\u00BB', 0x8, 1);
+ AddCharMapGroup ('\u226B', 0x8, 1);
+ break;
+ case 0xF7:
+ AddCharMap ('\u01C0', 0x8, 1, 0);
+ AddCharMap ('\u01C1', 0x8, 1, 0);
+ AddCharMap ('\u01C2', 0x8, 1, 0);
+ break;
+ }
+ }
#endregion
- // FIXME: for 07 xx we need more love.
+ #region Hack!
// Characters w/ diacritical marks (NFKD)
for (int i = 0; i <= char.MaxValue; i++) {
int start = decompIndex [i];
int primaryChar = decompValues [start];
- int secondary = 0;
+ int secondary = diacritical [i];
bool skip = false;
int length = decompLength [i];
// special processing for parenthesized ones.
}
- // category 08 - symbols
- fillIndex [0x8] = 2;
- // Here Windows mapping is not straightforward. It is
- // not based on computation but seems manual sorting.
- AddCharMapGroup ('+', 0x8, 1, 0); // plus
- AddCharMapGroup ('\u2212', 0x8, 1, 0); // minus
- AddCharMapGroup ('\u229D', 0x8, 1, 0); // minus
- AddCharMapGroup ('\u2297', 0x8, 1, 0); // mul
- AddCharMapGroup ('\u2044', 0x8, 1, 0); // div
- AddCharMapGroup ('\u2215', 0x8, 1, 0); // div
- AddCharMapGroup ('\u2217', 0x8, 1, 0); // mul
- AddCharMapGroup ('\u2218', 0x8, 1, 0); // ring
- AddCharMapGroup ('\u2219', 0x8, 1, 0); // bullet
- AddCharMapGroup ('\u2213', 0x8, 1, 0); // minus-or-plus
- AddCharMapGroup ('\u003C', 0x8, 1, 0); // <
- AddCharMapGroup ('\u227A', 0x8, 1, 0); // precedes relation
- AddCharMapGroup ('\u22B0', 0x8, 1, 0); // precedes under relation
-
- for (int cp = 0; cp < 0x2300; cp++) {
- if (cp == 0xAC) // SPECIAL CASE: skip
- continue;
- if (cp == 0x200) {
- cp = 0x2200; // skip to 2200
- fillIndex [0x8] = 0x21;
- }
- if (cp == 0x2295)
- fillIndex [0x8] = 0x3;
- if (cp == 0x22B2)
- fillIndex [0x8] = 0xB9;
- if (!map [cp].Defined &&
-// Char.GetUnicodeCategory ((char) cp) ==
-// UnicodeCategory.MathSymbol)
- Char.IsSymbol ((char) cp))
- AddCharMapGroup ((char) cp, 0x8, 1, diacritical [cp]);
- // SPECIAL CASES: no idea why Windows sorts as such
- switch (cp) {
- case 0x3E:
- AddCharMap ('\u227B', 0x8, 1, 0);
- AddCharMap ('\u22B1', 0x8, 1, 0);
- break;
- case 0xB1:
- AddCharMapGroup ('\u00AB', 0x8, 1, 0);
- AddCharMapGroup ('\u226A', 0x8, 1, 0);
- AddCharMapGroup ('\u00BB', 0x8, 1, 0);
- AddCharMapGroup ('\u226B', 0x8, 1, 0);
- break;
- case 0xF7:
- AddCharMap ('\u01C0', 0x8, 1, 0);
- AddCharMap ('\u01C1', 0x8, 1, 0);
- AddCharMap ('\u01C2', 0x8, 1, 0);
- break;
- }
- }
+ // Diacritical weight adjustment
- #region Level2 adjustment
// Arabic Hamzah
diacritical [0x624] = 0x5;
diacritical [0x626] = 0x7;
mod = diacritical [i];
break;
case 0x13: // Arabic
+ if (i == 0x0621)
+ break; // 0
+ if (diacritical [i] == 0 && decompLength [i] != 0)
+ diacritical [i] = map [decompValues [decompIndex [i]]].Level2;
if (diacritical [i] == 0 && i >= 0xFE8D)
mod = 0x8; // default for arabic
break;
map [i] = new CharMapEntry (
cat, map [i].Level1, mod);
}
- #endregion
- // FIXME: this is hack but those NonSpacingMark
+ // FIXME: this is half a hack, but those NonSpacingMark
// characters and still undefined are likely to
// be nonspacing.
- for (int i = 0; i < char.MaxValue; i++)
- if (!map [i].Defined &&
- !IsIgnorable (i) &&
- Char.GetUnicodeCategory ((char) i) ==
+ for (int i = 0; i < char.MaxValue; i++) {
+ if (map [i].Defined ||
+ IsIgnorable (i))
+ continue;
+ switch (i) {
+ // SPECIAL CASES.
+ case 0x02B9:
+ case 0x02BA:
+ break;
+ default:
+ if (Char.GetUnicodeCategory ((char) i) !=
UnicodeCategory.NonSpacingMark)
+ continue;
+ break;
+ }
+ if (diacritical [i] != 0)
+ map [i] = new CharMapEntry (1, 1, diacritical [i]);
+ else
AddCharMap ((char) i, 1, 1);
+ }
- // FIXME: this is hack but those Symbol characters
- // are likely to fall into 0xA category.
- for (int i = 0; i < char.MaxValue; i++)
- if (!map [i].Defined &&
- !IsIgnorable (i) &&
- Char.IsSymbol ((char) i))
- AddCharMap ((char) i, 0xA, 1);
+ #endregion
+ }
+
+ TextInfo ti = CultureInfo.InvariantCulture.TextInfo;
+
+ // Gives code point i a map entry derived from another code point:
+ // either its uppercase form (when checkUpper is set and that form
+ // is in category 0xF), or the first character of its decomposition.
+ // Code points that are already defined, or whose decompIndex entry
+ // is 0, are left untouched.
+ private void FillLetterNFKD (int i, bool checkUpper, bool greekRemap)
+ {
+     if (map [i].Defined)
+         return;
+
+     int upper = (int) ti.ToUpper ((char) i);
+     if (checkUpper && map [upper].Category == 0xF) {
+         // Case variant: fill the uppercase form first, then
+         // reuse its level 1 / level 2 values.
+         if (upper == i)
+             return;
+         FillLetterNFKD (upper, checkUpper, greekRemap);
+         map [i] = new CharMapEntry (0xF,
+             map [upper].Level1,
+             map [upper].Level2);
+         return;
+     }
+
+     int idx = decompIndex [i];
+     if (idx == 0)
+         return;
+     int primary = decompValues [idx];
+     FillLetterNFKD (primary, checkUpper, greekRemap);
+
+     // Accumulate the level 2 values of the remaining characters
+     // of the decomposition; bail out unless all are category 1.
+     int lv2 = map [primary].Level2;
+     byte off = 0;
+     int end = idx + decompLength [i];
+     for (int pos = idx + 1; pos < end; pos++) {
+         int mark = decompValues [pos];
+         if (map [mark].Category != 1)
+             return;
+         if (greekRemap && map [mark].Level2 == 0xC)
+             off += 3;
+         else
+             off += map [mark].Level2;
+     }
+     if (off > 0)
+         lv2 = (lv2 == 0 ? 2 : lv2) + off;
+     // ... but override if the value already exists.
+     if (diacritical [i] != 0)
+         lv2 = diacritical [i];
+     map [i] = new CharMapEntry (
+         map [primary].Category,
+         map [primary].Level1,
+         (byte) lv2);
+ }
private void IncrementSequentialIndex (ref byte hangulCat)
char c = (char) (i + b);
byte arg = (byte) (b > 0 ? b + 2 : 0);
// Hiragana
- AddLetterMapCore (c, 0x22, 0, arg);
+ AddLetterMapCore (c, 0x22, 0, arg, false);
// Katakana
- AddLetterMapCore ((char) (c + 0x60), 0x22, 0, arg);
+ AddLetterMapCore ((char) (c + 0x60), 0x22, 0, arg, false);
}
}
private void AddLetterMap (char c, byte category, byte updateCount)
{
- AddLetterMapCore (c, category, updateCount, 0);
+ AddLetterMapCore (c, category, updateCount, 0, true); // level2 = 0, deferLevel2 = true (forwarded by AddLetterMapCore to AddCharMapGroup)
}
- private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2)
+ private void AddLetterMapCore (char c, byte category, byte updateCount, byte level2, bool deferLevel2) // deferLevel2 is not interpreted here, only passed on to AddCharMapGroup
{
char c2;
// <small> updates index
c2 = ToSmallForm (c);
if (c2 != c)
- AddCharMapGroup (c2, category, updateCount, level2);
+ AddCharMapGroup (c2, category, updateCount, level2, deferLevel2); // small form of c, added with the caller's updateCount
c2 = Char.ToLower (c, CultureInfo.InvariantCulture);
if (c2 != c && !map [(int) c2].Defined)
- AddLetterMapCore (c2, category, 0, level2);
+ AddLetterMapCore (c2, category, 0, level2, deferLevel2); // not-yet-defined lowercase form is handled first, with updateCount 0
bool doUpdate = true;
if (IsIgnorable ((int) c) || map [(int) c].Defined)
doUpdate = false;
else
- AddCharMapGroup (c, category, 0, level2);
+ AddCharMapGroup (c, category, 0, level2, deferLevel2); // updateCount 0 here: fillIndex is advanced below when doUpdate is set
if (doUpdate)
fillIndex [category] += updateCount;
}
DecompositionWide,
DecompositionNarrow,
};
+ private void AddCharMapGroup (char c, byte category, byte updateCount) // overload without an explicit level 2 value
+ {
+ AddCharMapGroup (c, category, updateCount, 0, true); // deferLevel2 = true: the 5-argument overload replaces level2 with diacritical [c]
+ }
+
private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2)
+ {
+ AddCharMapGroup (c, category, updateCount, level2, false); // deferLevel2 = false: the caller-supplied level2 is passed through
+ }
+
+ private void AddCharMapGroup (char c, byte category, byte updateCount, byte level2, bool deferLevel2)
{
if (map [(int) c].Defined)
return;
+ if (deferLevel2)
+ level2 = diacritical [(int) c];
+
char small = char.MinValue;
char vertical = char.MinValue;
Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
}
// <small> updates index
- if (small != char.MinValue)
- AddCharMap (small, category, updateCount);
+ if (small != char.MinValue) {
+ if (level2 == 0 && deferLevel2)
+ level2 = diacritical [small];
+ AddCharMap (small, category, updateCount, level2);
+ }
// itself
AddCharMap (c, category, 0, level2);
if (nfkd != null) {
foreach (int weight in sameWeightItems) {
object wv = nfkd [(byte) weight];
- if (wv != null)
+ if (wv != null) {
+ if (deferLevel2)
+ level2 = diacritical [(int) wv];
AddCharMap ((char) ((int) wv), category, 0, level2);
+ }
}
}
// update index here.
fillIndex [category] += updateCount;
- if (vertical != char.MinValue)
+ if (vertical != char.MinValue) {
+ if (level2 == 0 && deferLevel2)
+ level2 = diacritical [vertical];
AddCharMap (vertical, category, updateCount, level2);
+ }
}
private void AddCharMapCJK (char c, ref byte category)
// For now it is only for 0x7 category.
private void AddCharMapGroup2 (char c, byte category, byte updateCount, byte level2)
{
- char small = char.MinValue;
- char vertical = char.MinValue;
- Hashtable nfkd = (Hashtable) nfkdMap [(int) c];
- if (nfkd != null) {
- object smv = nfkd [(byte) DecompositionSmall];
- if (smv != null)
- small = (char) ((int) smv);
- object vv = nfkd [(byte) DecompositionVertical];
- if (vv != null)
- vertical = (char) ((int) vv);
+ if (map [(int) c].Defined) // c is already mapped: leave it and its variants alone
+ return;
+
+ bool updateWeight = false; // set once a DecompositionSmall variant of c has been added
+ // Pass 1: process in advance (lower primary weight) the undefined code points whose single-character decomposition is c and whose type is DecompositionSmall
+ for (int c2 = 0; c2 < char.MaxValue; c2++) { // NOTE(review): the bound excludes U+FFFF itself
+ if (!map [c2].Defined &&
+ decompLength [c2] == 1 &&
+ (int) (decompValues [decompIndex [c2]]) == (int) c) {
+ switch (decompType [c2]) {
+ case DecompositionSmall:
+ updateWeight = true;
+ AddCharMap ((char) c2, category,
+ 0, level2);
+ break;
+ }
+ }
}
+ if (updateWeight) // advance fillIndex only if pass 1 added something
+ fillIndex [category] = (byte)
+ (fillIndex [category] + updateCount);
- // <small> updates index
- if (small != char.MinValue)
- // SPECIAL CASE excluded (FIXME: why?)
- if (small != '\u2024')
- AddCharMap (small, category, updateCount);
+ // Pass 2: identical weight - sub/super/wide/narrow variants of c are added with updateCount 0, just before c itself
+ for (int c2 = 0; c2 < char.MaxValue; c2++) {
+ if (!map [c2].Defined &&
+ decompLength [c2] == 1 &&
+ (int) (decompValues [decompIndex [c2]]) == (int) c) {
+ switch (decompType [c2]) {
+ case DecompositionSub:
+ case DecompositionSuper:
+ case DecompositionWide:
+ case DecompositionNarrow:
+ AddCharMap ((char) c2, category,
+ 0, level2);
+ break;
+ }
+ }
+ }
// itself
AddCharMap (c, category, updateCount, level2);
// Since nfkdMap is problematic to have two or more
// NFKD to an identical character, here I iterate all.
for (int c2 = 0; c2 < char.MaxValue; c2++) {
- if (decompLength [c2] == 1 &&
+ if (!map [c2].Defined && // skip anything that already has an entry
+ decompLength [c2] == 1 &&
(int) (decompValues [decompIndex [c2]]) == (int) c) {
switch (decompType [c2]) {
- case DecompositionCompat:
+ case DecompositionWide:
+ case DecompositionNarrow:
+ case DecompositionSmall:
+ case DecompositionSub:
+ case DecompositionSuper:
+ continue; // these decomposition types belong to the two earlier passes
+ default: // every other decomposition type is added here, with updateCount
AddCharMap ((char) c2, category, updateCount, level2);
break;
}
}
}
-
- if (vertical != char.MinValue)
- // SPECIAL CASE excluded (FIXME: why?)
- if (vertical != '\uFE33' && vertical != '\uFE34')
- AddCharMap (vertical, category, updateCount, level2);
}
- private void AddArabicCharMap (char c)
+ private void AddArabicCharMap (char c, byte category, byte updateCount, byte level2)
{
- byte category = 6;
- byte updateCount = 1;
- byte level2 = 0;
-
// itself
AddCharMap (c, category, 0, level2);
// CJK compat
if ('\u3192' <= c && c <= '\u319F')
return 0;
- // Japanese reading marks
- if (c == '\u3001' || c == '\u3002')
+
+ // They have <narrow> NFKD mapping, and on Windows
+ // those narrow characters are regarded as "normal",
+ // thus those characters themselves are regarded as
+ // "wide". grep "<narrow>" and you can pick them up
+ // (ignoring Kana, Hangul etc.)
+ switch (c) {
+ case '\u3002':
+ case '\u300C':
+ case '\u300D':
+ case '\u3001':
+ case '\u30FB':
+ case '\u2502':
+ case '\u2190':
+ case '\u2191':
+ case '\u2192':
+ case '\u2193':
+ case '\u25A0':
+ case '\u25CB':
return 1;
+ }
// Korean
if ('\u11A8' <= c && c <= '\u11F9')
return 2;
if ('\u2160' <= c && c <= '\u216F')
return 0x10;
if ('\u2181' <= c && c <= '\u2182')
- return 0x18;
+ return 0x10;
// Arabic
if ('\u2135' <= c && c <= '\u2138')
return 4;
- if ('\uFE80' <= c && c < '\uFF00') {
+ // I believe that Windows has a bug on setting level 3
+ // weight here. NFKD results in different values.
+ if ('\uFE80' < c && c < '\uFF00') {
// 2(Isolated)/8(Final)/0x18(Medial)
switch (decompType [(int) c]) {
case DecompositionIsolated:
- return 2;
+ return 0; // 2;
case DecompositionFinal:
return 8;
case DecompositionMedial:
return 0x18;
+ case DecompositionInitial:
+ return 0x10;
}
}
+ // I have no idea why those symbols have level 3 weight
+ if (c == '\u2104' || c == '\u212B')
+ return 0x18;
+ if ('\u211E' <= c && c <= '\u212B')
+ return 0x10;
+
// actually I dunno the reason why they have weights.
switch (c) {
case '\u01BC':
return 0x20;
case '\u06AA':
return 0x28;
+ // Gurmukhi
+ case '\u0A39':
+ case '\u0A59':
+ case '\u0A5A':
+ case '\u0A5B':
+ case '\u0A5E':
+ return 0x10;
}
byte ret = 0;
switch (c) {
case '\u03C2':
- case '\u2104':
case '\u212B':
- ret |= 8;
+ ret = 8;
break;
case '\uFE42':
- ret |= 0xC;
+ ret = 0xA;
break;
}