// If there are characters whose primary weight is 0, they are consumed
// and considered as a part of the character element.
//
-#define Binary
using System;
using System.IO;
// Character-name fragments that mark a diacritic or an enclosed form
// (CIRCLED / PARENTHESIZED). The diacritical-weight loop further down
// searches each parsed input line for every fragment with IndexOf (...) > 0;
// a hit applies the entry at the same index in diacriticWeights to
// diacritical [cp], so the order of the two arrays must stay in step.
// NOTE(review): the trailing ';' on many entries presumably anchors the match
// at the end of the name field -- confirm against the input file format.
string [] diacritics = new string [] {
// LATIN
- "WITH VERTICAL LINE ABOVE;",
- "WITH GRAVE ACCENT;", "WITH ACUTE ACCENT;", "WITH CIRCUMFLEX ACCENT;",
- "WITH ACUTE;", "WITH GRAVE;", "WITH DOT ABOVE;", " MIDDLE DOT;",
+ "WITH ACUTE;", "WITH GRAVE;", " DOT ABOVE;", " MIDDLE DOT;",
"WITH CIRCUMFLEX;", "WITH DIAERESIS;", "WITH CARON;", "WITH BREVE;",
- " DIALYTIKA AND TONOS;", "WITH MACRON;", "WITH TILDE;", "WITH RING ABOVE;",
- "WITH OGONEK;", "WITH CEDILLA;",
- //
+ " DIALYTIKA AND TONOS;", "WITH MACRON;", "WITH TILDE;", " RING ABOVE;",
+ " OGONEK;", " CEDILLA;",
" DOUBLE ACUTE;", " ACUTE AND DOT ABOVE;",
- "WITH STROKE;", " CIRCUMFLEX AND ACUTE;",
- "STROKE OVERLAY",
+ " STROKE;", " CIRCUMFLEX AND ACUTE;",
" DIAERESIS AND ACUTE;", "WITH CIRCUMFLEX AND GRAVE;", " L SLASH;",
" DIAERESIS AND GRAVE;",
" BREVE AND ACUTE;",
" CARON AND DOT ABOVE;", " BREVE AND GRAVE;",
" MACRON AND ACUTE;",
" MACRON AND GRAVE;",
- //
" DIAERESIS AND CARON", " DOT ABOVE AND MACRON", " TILDE AND ACUTE",
" RING ABOVE AND ACUTE",
" DIAERESIS AND MACRON", " CEDILLA AND ACUTE", " MACRON AND DIAERESIS",
" BREVE AND TILDE",
" CEDILLA AND BREVE",
" OGONEK AND MACRON",
- //
- "WITH OVERLINE",
- "WITH HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
+ " HOOK;", "LEFT HOOK;", " WITH HOOK ABOVE;",
" DOUBLE GRAVE;",
" INVERTED BREVE",
- "ROMAN NUMERAL",
" PRECEDED BY APOSTROPHE",
- "WITH HORN;",
+ " HORN;",
" LINE BELOW;", " CIRCUMFLEX AND HOOK ABOVE",
" PALATAL HOOK",
" DOT BELOW;",
" RETROFLEX;", "DIAERESIS BELOW",
" RING BELOW",
- //
" CIRCUMFLEX BELOW", "HORN AND ACUTE",
" BREVE BELOW;", " HORN AND GRAVE",
" TILDE BELOW",
- " TOPBAR",
" DOT BELOW AND DOT ABOVE",
" RIGHT HALF RING", " HORN AND TILDE",
" CIRCUMFLEX AND DOT BELOW",
" HORN AND HOOK ABOVE",
" HORN AND DOT",
// CIRCLED, PARENTHESIZED and so on
- "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN",
- "CIRCLED KATAKANA", "CIRCLED SANS-SERIF",
+ "CIRCLED DIGIT", "CIRCLED NUMBER", "CIRCLED LATIN", "CIRCLED KATAKANA",
"PARENTHESIZED DIGIT", "PARENTHESIZED NUMBER", "PARENTHESIZED LATIN",
};
// Weights paired index-for-index with the entries of diacritics: when the
// fragment diacritics [d] is found in an input line, diacriticWeights [d] is
// applied to diacritical [cp]. Keep both arrays the same length and in the
// same order; recount both whenever an entry is added or removed.
byte [] diacriticWeights = new byte [] {
// LATIN.
- 5,
- 0xF, 0xE, 0x12,
0xE, 0xF, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16,
0x17, 0x19, 0x1A, 0x1B, 0x1C,
- //
- 0x1D, 0x1D, 0x1E, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
+ 0x1D, 0x1D, 0x1E, 0x1E, 0x1F, 0x1F, 0x1F,
0x20, 0x21, 0x22, 0x22, 0x23, 0x24,
- //
0x25, 0x25, 0x25, 0x26, 0x28, 0x28, 0x28,
0x29, 0x2A, 0x2B, 0x2C, 0x2F, 0x30,
- //
- 0x40, 0x43, 0x43, 0x43, 0x44, 0x46, 0x47, 0x48,
+ 0x43, 0x43, 0x43, 0x44, 0x46, 0x48,
0x52, 0x55, 0x55, 0x57, 0x58, 0x59, 0x59, 0x5A,
- //
- 0x60, 0x60, 0x61, 0x61, 0x63, 0x68, 0x68,
+ 0x60, 0x60, 0x61, 0x61, 0x63, 0x68,
0x69, 0x69, 0x6A, 0x6D, 0x6E,
0x95, 0xAA,
// CIRCLED, PARENTHESIZED and so on.
- 0xEE, 0xEE, 0xEE, 0xEE, 0xEE,
- 0xF3, 0xF3, 0xF3
+ 0xEE, 0xEE, 0xEE, 0xEE, 0xF3, 0xF3, 0xF3, 0xF3
};
int [] numberSecondaryWeightBounds = new int [] {
// based on traditional Tamil consonants, except for
// Grantha (where Microsoft breaks traditionalism).
// http://www.angelfire.com/empire/thamizh/padanGaL
- '\u0B95', '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F',
- '\u0BA3', '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE',
- '\u0BAF', '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4',
- '\u0BB3', '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8',
- '\u0BB7', '\u0BB9'};
+ '\u0B99', '\u0B9A', '\u0B9E', '\u0B9F', '\u0BA3',
+ '\u0BA4', '\u0BA8', '\u0BAA', '\u0BAE', '\u0BAF',
+ '\u0BB0', '\u0BB2', '\u0BB5', '\u0BB4', '\u0BB3',
+ '\u0BB1', '\u0BA9', '\u0B9C', '\u0BB8', '\u0BB7',
+ '\u0BB9'};
// cp -> character name (only for some characters)
ArrayList sortableCharNames = new ArrayList ();
// cp -> level1 value
Hashtable arabicLetterPrimaryValues = new Hashtable ();
- Hashtable cyrillicLetterPrimaryValues = new Hashtable ();
// letterName -> cp
Hashtable arabicNameMap = new Hashtable ();
- Hashtable cyrillicNameMap = new Hashtable ();
// cp -> Hashtable [decompType] -> cp
Hashtable nfkdMap = new Hashtable ();
ArrayList jisJapanese = new ArrayList ();
ArrayList nonJisJapanese = new ArrayList ();
- ushort [] cjkJA = new ushort [char.MaxValue +1];// - 0x4E00];
- ushort [] cjkCHS = new ushort [char.MaxValue +1];// - 0x3100];
- ushort [] cjkCHT = new ushort [char.MaxValue +1];// - 0x4E00];
- ushort [] cjkKO = new ushort [char.MaxValue +1];// - 0x4E00];
- byte [] cjkKOlv2 = new byte [char.MaxValue +1];// - 0x4E00];
+ ushort [] cjkJA = new ushort [char.MaxValue - 0x4E00];
+ ushort [] cjkCHS = new ushort [char.MaxValue - 0x3100];
+ ushort [] cjkCHT = new ushort [char.MaxValue - 0x4E00];
+ ushort [] cjkKO = new ushort [char.MaxValue - 0x4E00];
+ byte [] cjkKOlv2 = new byte [char.MaxValue - 0x4E00];
byte [] ignorableFlags = new byte [char.MaxValue + 1];
source, typeof (byte), i);
}
- ushort [] CompressArray (ushort [] source, CodePointIndexer i)
- {
- return (ushort []) CodePointIndexer.CompressArray (
- source, typeof (ushort), i);
- }
-
void Serialize ()
{
// Tailorings
byte [] level1 = new byte [map.Length];
byte [] level2 = new byte [map.Length];
byte [] level3 = new byte [map.Length];
- ushort [] widthCompat = new ushort [map.Length];
+ int [] widthCompat = new int [map.Length];
for (int i = 0; i < map.Length; i++) {
categories [i] = map [i].Category;
level1 [i] = map [i].Level1;
case DecompositionSuper:
case DecompositionSub:
// they are always 1 char
- widthCompat [i] = (ushort) decompValues [decompIndex [i]];
+ widthCompat [i] = decompValues [decompIndex [i]];
break;
}
}
MSCompatUnicodeTableUtil.Level2);
level3 = CompressArray (level3,
MSCompatUnicodeTableUtil.Level3);
- widthCompat = (ushort []) CodePointIndexer.CompressArray (
- widthCompat, typeof (ushort),
+ widthCompat = (int []) CodePointIndexer.CompressArray (
+ widthCompat, typeof (int),
MSCompatUnicodeTableUtil.WidthCompat);
- cjkCHS = CompressArray (cjkCHS,
- MSCompatUnicodeTableUtil.CjkCHS);
- cjkCHT = CompressArray (cjkCHT,
- MSCompatUnicodeTableUtil.Cjk);
- cjkJA = CompressArray (cjkJA,
- MSCompatUnicodeTableUtil.Cjk);
- cjkKO = CompressArray (cjkKO,
- MSCompatUnicodeTableUtil.Cjk);
- cjkKOlv2 = CompressArray (cjkKOlv2,
- MSCompatUnicodeTableUtil.Cjk);
// Ignorables
- Result.WriteLine ("internal static readonly byte [] ignorableFlags = new byte [] {");
-#if Binary
- MemoryStream ms = new MemoryStream ();
- BinaryWriter binary = new BinaryWriter (ms);
- binary.Write (ignorableFlags.Length);
-#endif
+ Result.WriteLine ("static byte [] ignorableFlags = new byte [] {");
for (int i = 0; i < ignorableFlags.Length; i++) {
byte value = ignorableFlags [i];
if (value < 10)
Result.Write ("{0},", value);
else
Result.Write ("0x{0:X02},", value);
-#if Binary
- binary.Write (value);
-#endif
if ((i & 0xF) == 0xF)
Result.WriteLine ("// {0:X04}", i - 0xF);
}
Result.WriteLine ();
// Primary category
- Result.WriteLine ("internal static readonly byte [] categories = new byte [] {");
-#if Binary
- binary.Write (categories.Length);
-#endif
+ Result.WriteLine ("static byte [] categories = new byte [] {");
for (int i = 0; i < categories.Length; i++) {
byte value = categories [i];
if (value < 10)
Result.Write ("{0},", value);
else
Result.Write ("0x{0:X02},", value);
-#if Binary
- binary.Write (value);
-#endif
if ((i & 0xF) == 0xF)
Result.WriteLine ("// {0:X04}", i - 0xF);
}
Result.WriteLine ();
// Primary weight value
- Result.WriteLine ("internal static readonly byte [] level1 = new byte [] {");
-#if Binary
- binary.Write (level1.Length);
-#endif
+ Result.WriteLine ("static byte [] level1 = new byte [] {");
for (int i = 0; i < level1.Length; i++) {
byte value = level1 [i];
if (value < 10)
Result.Write ("{0},", value);
else
Result.Write ("0x{0:X02},", value);
-#if Binary
- binary.Write (value);
-#endif
if ((i & 0xF) == 0xF)
Result.WriteLine ("// {0:X04}", i - 0xF);
}
Result.WriteLine ();
// Secondary weight
- Result.WriteLine ("internal static readonly byte [] level2 = new byte [] {");
-#if Binary
- binary.Write (level2.Length);
-#endif
+ Result.WriteLine ("static byte [] level2 = new byte [] {");
for (int i = 0; i < level2.Length; i++) {
- byte value = level2 [i];
+ int value = level2 [i];
if (value < 10)
Result.Write ("{0},", value);
else
Result.Write ("0x{0:X02},", value);
-#if Binary
- binary.Write (value);
-#endif
if ((i & 0xF) == 0xF)
Result.WriteLine ("// {0:X04}", i - 0xF);
}
Result.WriteLine ();
// Tertiary weight
- Result.WriteLine ("internal static readonly byte [] level3 = new byte [] {");
-#if Binary
- binary.Write (level3.Length);
-#endif
+ Result.WriteLine ("static byte [] level3 = new byte [] {");
for (int i = 0; i < level3.Length; i++) {
byte value = level3 [i];
if (value < 10)
Result.Write ("{0},", value);
else
Result.Write ("0x{0:X02},", value);
-#if Binary
- binary.Write (value);
-#endif
if ((i & 0xF) == 0xF)
Result.WriteLine ("// {0:X04}", i - 0xF);
}
// Width insensitivity mappings
// (for now it is more lightweight than dumping the
// entire NFKD table).
- Result.WriteLine ("internal static readonly ushort [] widthCompat = new ushort [] {");
-#if Binary
- binary.Write (widthCompat.Length);
-#endif
+ Result.WriteLine ("static int [] widthCompat = new int [] {");
for (int i = 0; i < widthCompat.Length; i++) {
- ushort value = widthCompat [i];
+ int value = widthCompat [i];
if (value < 10)
Result.Write ("{0},", value);
else
Result.Write ("0x{0:X02},", value);
-#if Binary
- binary.Write (value);
-#endif
if ((i & 0xF) == 0xF)
Result.WriteLine ("// {0:X04}", i - 0xF);
}
Result.WriteLine ("};");
Result.WriteLine ();
-#if Binary
- using (FileStream fs = File.Create ("../collation.core.bin")) {
- byte [] array = ms.ToArray ();
- fs.Write (array, 0, array.Length);
- }
-#endif
// CJK
SerializeCJK ("cjkCHS", cjkCHS, char.MaxValue);
void SerializeCJK (string name, ushort [] cjk, int max)
{
- int offset = 0;//char.MaxValue - cjk.Length;
+ int offset = char.MaxValue - cjk.Length;
Result.WriteLine ("static ushort [] {0} = new ushort [] {{", name);
-#if Binary
- MemoryStream ms = new MemoryStream ();
- BinaryWriter binary = new BinaryWriter (ms);
-#endif
for (int i = 0; i < cjk.Length; i++) {
if (i + offset == max)
break;
Result.Write ("{0},", value);
else
Result.Write ("0x{0:X04},", value);
-#if Binary
- binary.Write (value);
-#endif
if ((i & 0xF) == 0xF)
Result.WriteLine ("// {0:X04}", i - 0xF + offset);
}
Result.WriteLine ("};");
Result.WriteLine ();
-#if Binary
- using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) {
- byte [] array = ms.ToArray ();
- fs.Write (array, 0, array.Length);
- }
-#endif
}
void SerializeCJK (string name, byte [] cjk, int max)
{
- int offset = 0;//char.MaxValue - cjk.Length;
+ int offset = char.MaxValue - cjk.Length;
Result.WriteLine ("static byte [] {0} = new byte [] {{", name);
-#if Binary
- MemoryStream ms = new MemoryStream ();
- BinaryWriter binary = new BinaryWriter (ms);
-#endif
for (int i = 0; i < cjk.Length; i++) {
if (i + offset == max)
break;
Result.Write ("{0},", value);
else
Result.Write ("0x{0:X02},", value);
-#if Binary
- binary.Write (value);
-#endif
if ((i & 0xF) == 0xF)
Result.WriteLine ("// {0:X04}", i - 0xF + offset);
}
Result.WriteLine ("};");
Result.WriteLine ();
-#if Binary
- using (FileStream fs = File.Create (String.Format ("../collation.{0}.bin", name))) {
- byte [] array = ms.ToArray ();
- fs.Write (array, 0, array.Length);
- }
-#endif
}
void SerializeTailorings ()
Hashtable counts = new Hashtable ();
Result.WriteLine ("static char [] tailorings = new char [] {");
int count = 0;
-#if Binary
- MemoryStream ms = new MemoryStream ();
- BinaryWriter binary = new BinaryWriter (ms);
-#endif
foreach (Tailoring t in tailorings) {
if (t.Alias != 0)
continue;
Result.Write ("'\\x{0:X}', ", (int) c);
if (++count % 16 == 0)
Result.WriteLine (" // {0:X04}", count - 16);
-#if Binary
- binary.Write ((ushort) c);
-#endif
}
}
Result.WriteLine ("};");
Result.WriteLine ("static TailoringInfo [] tailoringInfos = new TailoringInfo [] {");
-#if Binary
- byte [] rawdata = ms.ToArray ();
- ms = new MemoryStream ();
- binary = new BinaryWriter (ms);
- binary.Write (tailorings.Count);
-#endif
foreach (Tailoring t in tailorings) {
int target = t.Alias != 0 ? t.Alias : t.LCID;
if (!indexes.ContainsKey (target)) {
- throw new Exception (String.Format ("WARNING: no corresponding definition for tailoring alias. From {0} to {1}", t.LCID, t.Alias));
+ Console.Error.WriteLine ("WARNING: no corresponding definition for tailoring alias. From {0} to {1}", t.LCID, t.Alias);
continue;
}
int idx = (int) indexes [target];
if (t2.LCID == t.LCID)
french = t2.FrenchSort;
Result.WriteLine ("new TailoringInfo ({0}, 0x{1:X}, {2}, {3}), ", t.LCID, idx, cnt, french ? "true" : "false");
-#if Binary
- binary.Write (t.LCID);
- binary.Write (idx);
- binary.Write (cnt);
- binary.Write (french);
-#endif
}
Result.WriteLine ("};");
-#if Binary
- binary.Write ((byte) 0xFF);
- binary.Write ((byte) 0xFF);
- binary.Write (rawdata.Length / 2);
- binary.Write (rawdata, 0, rawdata.Length);
-
-
- using (FileStream fs = File.Create ("../collation.tailoring.bin")) {
- byte [] array = ms.ToArray ();
- fs.Write (array, 0, array.Length);
- }
-#endif
}
#region Parse
if (idx > 0) {
string source = s.Substring (0, idx).Trim ();
string [] l = s.Substring (idx + 1).Trim ().Split (' ');
- byte [] b = new byte [4];
- for (int i = 0; i < 4; i++) {
+ byte [] b = new byte [5];
+ for (int i = 0; i < 5; i++) {
if (l [i] == "*")
b [i] = 0;
else
if (cp > char.MaxValue)
continue;
- double v = double.Parse (value);
for (int i = cp; i <= cpEnd; i++)
- unicodeAge [i] = v;
+ unicodeAge [i] = double.Parse (value);
}
}
unicodeAge [0] = double.MaxValue; // never be supported
this.decompValues = (int [])
decompValues.ToArray (typeof (int));
}
-
- char previousLatinTarget = char.MinValue;
- byte [] diacriticalOffset = new byte ['Z' - 'A' + 1];
-
+
void ProcessUnidataLine (string s, ArrayList decompValues)
{
int idx = s.IndexOf ('#');
string name = values [0];
- // SPECIAL CASE: rename some characters for diacritical
- // remapping. FIXME: why are they different?
- // FIXME: it's still not working.
- if (cp == 0x018B || cp == 0x018C)
- name = name.Replace ("TOPBAR", "STROKE");
-
// isSmallCapital
if (s.IndexOf ("SMALL CAPITAL") > 0)
isSmallCapital [cp] = true;
// latin mapping by character name
- if (s.IndexOf ("LATIN") >= 0) {
+ if (s.IndexOf ("LATIN") > 0) {
int lidx = s.IndexOf ("LETTER DOTLESS ");
int offset = lidx + 15;
if (lidx < 0) {
lidx = s.IndexOf ("LETTER TURNED ");
offset = lidx + 14;
}
- if (lidx < 0) {
- lidx = s.IndexOf ("LETTER CAPITAL ");
- offset = lidx + 15;
- }
- if (lidx < 0) {
- lidx = s.IndexOf ("LETTER SCRIPT ");
- offset = lidx + 14;
- }
if (lidx < 0) {
lidx = s.IndexOf ("LETTER ");
offset = lidx + 7;
}
char c = lidx > 0 ? s [offset] : char.MinValue;
- char n = s [offset + 1];
- char target = char.MinValue;
if ('A' <= c && c <= 'Z' &&
- (n == ' ') || n == ';') {
- target = c;
- // FIXME: After 'Z', I cannot reset this state.
- previousLatinTarget = c == 'Z' ? char.MinValue : c;
- }
-
- if (s.Substring (offset).StartsWith ("ALPHA"))
- target = 'A';
- else if (s.Substring (offset).StartsWith ("TONE SIX"))
- target = 'B';
- else if (s.Substring (offset).StartsWith ("OPEN O"))
- target = 'C';
- else if (s.Substring (offset).StartsWith ("SCHWA"))
- target = 'E';
- else if (s.Substring (offset).StartsWith ("ENG"))
- target = 'N';
- else if (s.Substring (offset).StartsWith ("OI;")) // 01A2,01A3
- target = 'O';
- else if (s.Substring (offset).StartsWith ("YR;")) // 01A2,01A3
- target = 'R';
- else if (s.Substring (offset).StartsWith ("TONE TWO"))
- target = 'S';
- else if (s.Substring (offset).StartsWith ("ESH"))
- target = 'S';
-
- if (target == char.MinValue)
- target = previousLatinTarget;
-
- if (target != char.MinValue) {
- ArrayList entry = (ArrayList) latinMap [target];
+ (s.Length == offset + 1 || s [offset + 1] == ' ')) {
+ ArrayList entry = (ArrayList) latinMap [c];
if (entry == null) {
entry = new ArrayList ();
- latinMap [target] = entry;
+ latinMap [c] = entry;
}
entry.Add (cp);
- // FIXME: This secondary weight is hack.
- // They are here because they must not
- // be identical to the corresponding
- // ASCII latins.
- if (c != target && diacritical [cp] == 0) {
- diacriticalOffset [c - 'A']++;
- diacritical [cp] = (byte) (diacriticalOffset [c - 'A'] + 0x7C);
- }
}
}
value = 0x18;
else
value = 0x19;
- } else if (s.IndexOf ("SHADE") > 0)
- value = 0x19;
- // SPECIAL CASE: BOX DRAWING DIAGONAL patterns
- switch (cp) {
- case 0x2571: value = 0xF; break;
- case 0x2572: value = 0x10; break;
- case 0x2573: value = 0x11; break;
}
if (value >= 0)
boxValues.Add (new DictionaryEntry (
if (0x2100 <= cp && cp <= 0x213F &&
Char.IsSymbol ((char) cp))
sortableCharNames.Add (
- new DictionaryEntry (cp, name));
+ new DictionaryEntry (cp, values [0]));
else if (0x3380 <= cp && cp <= 0x33DD)
sortableCharNames.Add (new DictionaryEntry (
- cp, name.Substring (7)));
+ cp, values [0].Substring (7)));
// diacritical weights by character name
-if (diacritics.Length != diacriticWeights.Length)
-throw new Exception (String.Format ("Should not happen. weights are {0} while labels are {1}", diacriticWeights.Length, diacritics.Length));
- for (int d = 0; d < diacritics.Length; d++) {
- if (s.IndexOf (diacritics [d]) > 0) {
- diacritical [cp] += diacriticWeights [d];
- if (s.IndexOf ("COMBINING") >= 0)
- diacritical [cp] -= (byte) 2;
- continue;
- }
- // also process "COMBINING blah" here
- // For now it is limited to cp < 0x0370
-// if (cp < 0x0300 || cp >= 0x0370)
-// continue;
- string tmp = diacritics [d].TrimEnd (';');
- if (tmp.IndexOf ("WITH ") == 0)
- tmp = tmp.Substring (4);
- tmp = String.Concat ("COMBINING", (tmp [0] != ' ' ? " " : ""), tmp);
- if (name == tmp)
- diacritical [cp] = (byte) (diacriticWeights [d] - 2);
-//if (name == tmp)
-//Console.Error.WriteLine ("======= {2:X04} : '{0}' / '{1}'", name, tmp, cp);
- }
+ for (int d = 0; d < diacritics.Length; d++)
+ if (s.IndexOf (diacritics [d]) > 0)
+ diacritical [cp] |= diacriticWeights [d];
// Two-step grep required for it.
if (s.IndexOf ("FULL STOP") > 0 &&
(s.IndexOf ("DIGIT") > 0 || s.IndexOf ("NUMBER") > 0))
diacritical [cp] |= 0xF4;
- // Cyrillic letter name
- if (0x0430 <= cp && cp <= 0x0486 &&
- Char.IsLetter ((char) cp)) {
- byte value = (byte) (cyrillicNameMap.Count * 3 + 0x06);
- // Get primary letter name i.e.
- // XXX part of CYRILLIC LETTER XXX yyy
- // e.g. "IZHITSA" for "IZHITSA DOUBLE GRAVE".
- string letterName =
- name.Substring (name.IndexOf ("LETTER ") + 7);
- int tmpIdx = letterName.IndexOf (' ');
- letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
-//Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
- if (cyrillicNameMap.ContainsKey (letterName))
- value = (byte) cyrillicLetterPrimaryValues [cyrillicNameMap [letterName]];
- else
- cyrillicNameMap [letterName] = cp;
-
- cyrillicLetterPrimaryValues [cp] = value;
- }
-
// Arabic letter name
if (0x0621 <= cp && cp <= 0x064A &&
Char.GetUnicodeCategory ((char) cp)
(cp == 0x0640) ?
// 0x0640 is special: it does
// not start with ARABIC LETTER
- name :
- name.Substring (14);
+ values [0] :
+ values [0].Substring (14);
int tmpIdx = letterName.IndexOf (' ');
letterName = tmpIdx < 0 ? letterName : letterName.Substring (0, tmpIdx);
//Console.Error.WriteLine ("Arabic name for {0:X04} is {1}", cp, letterName);
// Japanese square letter
if (0x3300 <= cp && cp <= 0x3357)
if (!ExistsJIS (cp))
- nonJisJapanese.Add (new NonJISCharacter (cp, name));
+ nonJisJapanese.Add (new NonJISCharacter (cp, values [0]));
// normalizationType
string decomp = values [4];
// Reads the JIS ordering table at `filename` line by line and adds one
// JISCharacter (cp, jis) to jisJapanese for every data line. Text from '#'
// onwards is discarded; empty lines and lines without a field separator are
// skipped. Parse or I/O failures propagate to the caller.
void ParseJISOrder (string filename)
{
- int line = 1;
- try {
- using (StreamReader file =
- new StreamReader (filename)) {
- for (;file.Peek () >= 0; line++)
- ProcessJISOrderLine (file.ReadLine ());
+ using (StreamReader file =
+ new StreamReader (filename)) {
+ while (file.Peek () >= 0) {
+ string s = file.ReadLine ();
+ int idx = s.IndexOf ('#');
+ if (idx >= 0)
+ s = s.Substring (0, idx).Trim ();
+ if (s.Length == 0)
+ continue;
+ idx = s.IndexOf (' ');
+ if (idx < 0)
+ continue;
+ // Both fields start with "0x", so skip that prefix. The second
+ // argument of Substring is a length: the JIS field ends at idx,
+ // so its hex digits span idx - 2 characters.
+ int jis = int.Parse (s.Substring (2, idx - 2), NumberStyles.HexNumber);
+ int cp = int.Parse (s.Substring (idx + 3).Trim (), NumberStyles.HexNumber);
+ jisJapanese.Add (new JISCharacter (cp, jis));
}
- } catch (Exception) {
- Console.Error.WriteLine ("---- line {0}", line);
- throw;
}
}
- char [] ws = new char [] {'\t', ' '};
-
- void ProcessJISOrderLine (string s)
- {
- int idx = s.IndexOf ('#');
- if (idx >= 0)
- s = s.Substring (0, idx).Trim ();
- if (s.Length == 0)
- return;
- idx = s.IndexOfAny (ws);
- if (idx < 0)
- return;
- // They start with "0x" so cut them out.
- int jis = int.Parse (s.Substring (2, idx - 2), NumberStyles.HexNumber);
- int cp = int.Parse (s.Substring (idx).Trim ().Substring (2), NumberStyles.HexNumber);
- jisJapanese.Add (new JISCharacter (cp, jis));
- }
-
void ParseCJK (string zhXML, string jaXML, string koXML)
{
XmlDocument doc = new XmlDocument ();
// Chinese Simplified
category = "chs";
arr = cjkCHS;
- offset = 0;//char.MaxValue - arr.Length;
+ offset = char.MaxValue - arr.Length;
doc.Load (zhXML);
s = doc.SelectSingleNode ("/ldml/collations/collation[@type='pinyin']/rules/pc").InnerText;
v = 0x8008;
// Chinese Traditional
category = "cht";
arr = cjkCHT;
- offset = 0;//char.MaxValue - arr.Length;
+ offset = char.MaxValue - arr.Length;
s = doc.SelectSingleNode ("/ldml/collations/collation[@type='stroke']/rules/pc").InnerText;
v = 0x8002;
foreach (char c in s) {
// Japanese
category = "ja";
arr = cjkJA;
- offset = 0;//char.MaxValue - arr.Length;
+ offset = char.MaxValue - arr.Length;
doc.Load (jaXML);
s = doc.SelectSingleNode ("/ldml/collations/collation/rules/pc").InnerText;
v = 0x8008;
//
category = "ko";
arr = cjkKO;
- offset = 0;//char.MaxValue - arr.Length;
+ offset = char.MaxValue - arr.Length;
doc.Load (koXML);
foreach (XmlElement reset in doc.SelectNodes ("/ldml/collations/collation/rules/reset")) {
XmlElement sc = (XmlElement) reset.NextSibling;
if (Char.IsNumber ((char) cp))
diacritical [cp] = weight;
- // Modify some decomposition equivalence
- decompType [0xFE31] = 0;
- decompIndex [0xFE31] = 0;
- decompLength [0xFE31] = 0;
- decompType [0xFE32] = 0;
- decompIndex [0xFE32] = 0;
- decompLength [0xFE32] = 0;
-
// Korean parens numbers
for (int i = 0x3200; i <= 0x321C; i++)
diacritical [i] = 0xA;
// Hyphen/Dash : 06 81 - 06 90
for (int i = 0; i < char.MaxValue; i++) {
- if (!IsIgnorable (i) &&
- Char.GetUnicodeCategory ((char) i) ==
- UnicodeCategory.DashPunctuation) {
- AddCharMapGroup2 ((char) i, 6, 1, 0);
- if (i == 0x2011) {
- // SPECIAL: add 2027 and 2043
- // Maybe they are regarded the
- // same hyphens in "central"
- // position.
- AddCharMap ('\u2027', 6, 1);
- AddCharMap ('\u2043', 6, 1);
- }
- }
+ if (Char.GetUnicodeCategory ((char) i)
+ == UnicodeCategory.DashPunctuation)
+ AddCharMapGroupTail ((char) i, 6, 1);
}
// Arabic variable weight chars 06 A0 -
fillIndex [6] = 0xA0;
// vowels
for (int i = 0x64B; i <= 0x650; i++)
- AddArabicCharMap ((char) i);
+ AddCharMapGroupTail ((char) i, 6, 1);
// sukun
AddCharMapGroup ('\u0652', 6, 1, 0);
// shadda
if (!IsIgnorable (i))
AddCharMap ((char) i, 0x1, 1);
- // FIXME: needs more love here (it should eliminate
- // all the hacky code above).
- for (int i = 0x0300; i < 0x0370; i++)
- if (!IsIgnorable (i) && diacritical [i] != 0
- /* especiall here*/ && !map [i].Defined)
- map [i] = new CharMapEntry (
- 0x1, 0x1, diacritical [i]);
-
- fillIndex [0x1] = 0xAC;
- for (int i = 0x07A6; i <= 0x07B0; i++)
- if (!IsIgnorable (i))
- AddCharMap ((char) i, 0x1, 1);
-
// LAMESPEC: It should not stop at '\u20E1'. There are
// a few more characters (that however results in
// overflow of level 2 unless we start before 0xDD).
AddCharMap ('\u2423', 0x7, 1, 0); // open box
#endregion
- // category 09 - continued symbols from 08
+ // FIXME: 09 should be more complete.
fillIndex [0x9] = 2;
// misc tech mark
for (int cp = 0x2300; cp <= 0x237A; cp++)
fillIndex [0xC]++;
int xcp;
- if (currValue <= 10) {
- xcp = (int) prevValue + 0x2170 - 1;
- AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
- xcp = (int) prevValue + 0x2160 - 1;
- AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
- fillIndex [0xC] += 2;
- xcp = (int) prevValue + 0x3021 - 1;
- AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
- fillIndex [0xC]++;
- }
- else if (currValue == 11)
- fillIndex [0xC]++;
+ xcp = (int) prevValue + 0x2170 - 1;
+ AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
+ xcp = (int) prevValue + 0x2160 - 1;
+ AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
+ fillIndex [0xC] += 2;
+ xcp = (int) prevValue + 0x3021 - 1;
+ AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
+ fillIndex [0xC]++;
}
if (prevValue < currValue)
prevValue = currValue;
else if (cp == 0x3021) // FIXME: why?
fillIndex [0xC]++;
AddCharMapGroup ((char) cp, 0xC, 0, diacritical [cp]);
+
if (addnew || cp <= '9') {
- int mod = (int) currValue - 1;
int xcp;
if (1 <= currValue && currValue <= 10) {
- xcp = mod + 0x2776;
+ xcp = cp - 0x31 + 0x2776;
AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
- xcp = mod + 0x2780;
+ xcp = cp - 0x31 + 0x2780;
AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
- xcp = mod + 0x278A;
+ xcp = cp - 0x31 + 0x278A;
AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
}
if (1 <= currValue && currValue <= 20) {
- xcp = mod + 0x2460;
+ xcp = cp - 0x31 + 0x2460;
AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
- xcp = mod + 0x2474;
+ xcp = cp - 0x31 + 0x2474;
AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
- xcp = mod + 0x2488;
+ xcp = cp - 0x31 + 0x2488;
AddCharMap ((char) xcp, 0xC, 0, diacritical [xcp]);
}
}
// but inside a-to-z range.
// 3.there are some expanded characters that
// are not part of Unicode Standard NFKD.
- // 4. some characters are letter in IsLetter
- // but not in sortkeys (maybe unicode version
- // difference caused it).
switch (i) {
// 1. skipping them does not make sense
// case 0xD0: case 0xF0: case 0x131: case 0x138:
case 0xFE: // Icelandic Thorn
case 0xDF: // German ss
case 0xFF: // Latin small y with diaeresis (U+00FF)
- // 4.
- case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3:
// not classified yet
// case 0x1A6: case 0x1A7: case 0x1A8: case 0x1A9:
// case 0x1AA: case 0x1B1: case 0x1B7: case 0x1B8:
// case 0x1B9: case 0x1BA: case 0x1BB: case 0x1BF:
+// case 0x1C0: case 0x1C1: case 0x1C2: case 0x1C3:
// case 0x1DD:
continue;
}
if (Char.IsLetter ((char) i))
AddLetterMap ((char) i, 0xF, 1);
- // Cyrillic - character name order
- fillIndex [0x10] = 0x6;
-//*
-for (int i = 0; i < orderedCyrillic.Length; i++)
-Console.Error.WriteLine ("----- {0:x04}", (int) orderedCyrillic [i]);
-
+ // Cyrillic - UCA order w/ some modification
+ fillIndex [0x10] = 0x3;
// table which is mostly from UCA DUCET.
for (int i = 0; i < orderedCyrillic.Length; i++) {
- char c = Char.ToUpper (orderedCyrillic [i], CultureInfo.InvariantCulture);
- if (!IsIgnorable ((int) c) &&
- c <= '\u045C' &&
- Char.IsLetter (c)) {
- AddLetterMap (c, 0x10, 0);
- fillIndex [0x10] += 3;
- }
+ char c = orderedCyrillic [i];
+ if (Char.IsLetter (c))
+ AddLetterMap (c, 0x10, 3);
}
- /*
for (int i = 0x0460; i < 0x0481; i++) {
- if (Char.IsLetter ((char) i)) {
- AddLetterMap ((char) i, 0x10, 0);
- fillIndex [0x10] += 3;
- }
- }
- */
-/*
- for (int i = 0x0400; i <= 0x0486; i++) {
- if (!Char.IsLetter ((char) i)) {
-// AddCharMap ((char) i, 0x1, 1);
- continue;
- }
- if (!cyrillicLetterPrimaryValues.ContainsKey (i)) {
- Console.Error.WriteLine ("no value for {0:x04}", i);
- continue;
- }
- fillIndex [0x10] =
- (byte) cyrillicLetterPrimaryValues [i];
- AddLetterMap ((char) i, 0x10, 0);
+ if (Char.IsLetter ((char) i))
+ AddLetterMap ((char) i, 0x10, 3);
}
-*/
// Armenian
fillIndex [0x11] = 0x3;
if (!IsIgnorable (i))
AddLetterMap ((char) i, 0x14, 2);
fillIndex [0x14] = 0xB;
- for (int i = 0x0905; i < 0x093A; i++) {
- if (i == 0x0928)
- AddCharMap ('\u0929', 0x14, 0, 8);
- if (i == 0x0930)
- AddCharMap ('\u0931', 0x14, 0, 8);
- if (i == 0x0933)
- AddCharMap ('\u0934', 0x14, 0, 8);
+ for (int i = 0x0905; i < 0x093A; i++)
if (Char.IsLetter ((char) i))
AddLetterMap ((char) i, 0x14, 4);
- if (i == 0x090B)
- AddCharMap ('\u0960', 0x14, 4);
- if (i == 0x090C)
- AddCharMap ('\u0961', 0x14, 4);
- }
- fillIndex [0x14] = 0xDA;
- for (int i = 0x093E; i < 0x0945; i++)
- if (!IsIgnorable (i))
- AddLetterMap ((char) i, 0x14, 2);
- fillIndex [0x14] = 0xEC;
- for (int i = 0x0945; i < 0x094F; i++)
+ for (int i = 0x093E; i < 0x094F; i++)
if (!IsIgnorable (i))
AddLetterMap ((char) i, 0x14, 2);
// Gurmukhi. orderedGurmukhi is from UCA
// FIXME: it does not look equivalent to UCA.
- fillIndex [0x16] = 04;
- fillIndex [0x1] = 3;
+ fillIndex [0x1] = 03;
+ fillIndex [0x16] = 02;
for (int i = 0; i < orderedGurmukhi.Length; i++) {
char c = orderedGurmukhi [i];
if (IsIgnorable ((int) c))
continue;
- if (IsIgnorableNonSpacing (c)) {
+ if (!Char.IsLetter (c)) {
AddLetterMap (c, 0x1, 1);
continue;
}
if (c == '\u0A3C' || c == '\u0A4D' ||
'\u0A66' <= c && c <= '\u0A71')
continue;
- // SPECIAL CASE: U+A38 = U+A36 at primary level (why?)
- byte shift = 4;
- if (c == '\u0A36' || c == '\u0A16' || c == '\u0A17' || c == '\u0A5B' || c == '\u0A5E')
- shift = 0;
- AddLetterMap (c, 0x16, shift);
+ AddLetterMap (c, 0x16, 4);
}
// Gujarati. orderedGujarati is from UCA
- fillIndex [0x17] = 0x4;
- // nonspacing marks
- map [0x0A4D] = new CharMapEntry (1, 0, 0x3);
- map [0x0ABD] = new CharMapEntry (1, 0, 0x3);
- map [0x0A3C] = new CharMapEntry (1, 0, 0x4);
- map [0x0A71] = new CharMapEntry (1, 0, 0x6);
- map [0x0ABC] = new CharMapEntry (1, 0, 0xB);
- map [0x0A70] = new CharMapEntry (1, 0, 0xE);
- // letters go first.
- for (int i = 0; i < orderedGujarati.Length; i++) {
- // SPECIAL CASE
- char c = orderedGujarati [i];
- if (Char.IsLetter (c)) {
- // SPECIAL CASES
- if (c == '\u0AB3' || c == '\u0A32')
- continue;
- if (c == '\u0A33') {
- AddCharMap ('\u0A32', 0x17, 0);
- AddCharMap ('\u0A33', 0x17, 4, 4);
- continue;
- }
- if (c == '\u0A8B')
- AddCharMap ('\u0AE0', 0x17, 0, 5);
- AddCharMap (c, 0x17, 4);
-
- if (c == '\u0AB9')
- AddCharMap ('\u0AB3', 0x17, 6);
- }
- }
- // non-letters
- byte gujaratiShift = 4;
- fillIndex [0x17] = 0xC0;
- for (int i = 0; i < orderedGujarati.Length; i++) {
- char c = orderedGujarati [i];
- if (fillIndex [0x17] == 0xCC)
- gujaratiShift = 3;
- if (!Char.IsLetter (c)) {
- // SPECIAL CASES
- if (c == '\u0A82')
- AddCharMap ('\u0A81', 0x17, 2);
- if (c == '\u0AC2')
- fillIndex [0x17]++;
- AddLetterMap (c, 0x17, gujaratiShift);
- }
- }
+ fillIndex [0x17] = 02;
+ for (int i = 0; i < orderedGujarati.Length; i++)
+ AddLetterMap (orderedGujarati [i], 0x17, 4);
// Oriya
- fillIndex [0x1] = 03;
fillIndex [0x18] = 02;
for (int i = 0x0B00; i < 0x0B7F; i++) {
switch (Char.GetUnicodeCategory ((char) i)) {
case UnicodeCategory.NonSpacingMark:
case UnicodeCategory.DecimalDigitNumber:
- AddLetterMap ((char) i, 0x1, 1);
continue;
}
AddLetterMap ((char) i, 0x18, 1);
AddCharMap ('\u0BD7', 0x19, 0);
fillIndex [0x19] = 0xA;
// vowels
- for (int i = 0x0B82; i <= 0x0B94; i++)
- if (!IsIgnorable ((char) i))
+ for (int i = 0x0BD7; i < 0x0B94; i++)
+ if (Char.IsLetter ((char) i))
AddCharMap ((char) i, 0x19, 2);
// special vowel
- fillIndex [0x19] = 0x28;
+ fillIndex [0x19] = 0x24;
+ AddCharMap ('\u0B94', 0x19, 0);
+ fillIndex [0x19] = 0x26;
// The array for Tamil consonants is a constant.
// Windows have almost similar sequence to TAM from
// tamilnet but a bit different in Grantha.
for (int i = 0x0C80; i < 0x0CE5; i++) {
if (i == 0x0CD5 || i == 0x0CD6)
continue; // ignore
- if (i == 0x0CB1 || i == 0x0CB3 || i == 0x0CDE)
- continue; // shift after 0xCB9
AddCharMap ((char) i, 0x1B, 3);
- if (i == 0x0CB9) {
- // SPECIAL CASES: but why?
- AddCharMap ('\u0CB1', 0x1B, 3); // RRA
- AddCharMap ('\u0CB3', 0x1B, 3); // LLA
- AddCharMap ('\u0CDE', 0x1B, 3); // FA
- }
- if (i == 0x0CB2)
- AddCharMap ('\u0CE1', 0x1B, 3); // vocalic LL
}
// Malayalam
// Thai ... note that it breaks 0x1E wall after E2B!
// Also, all Thai characters have level 2 value 3.
fillIndex [0x1E] = 2;
- for (int i = 0xE40; i <= 0xE44; i++)
+ for (int i = 0xE44; i < 0xE48; i++)
AddCharMap ((char) i, 0x1E, 1, 3);
for (int i = 0xE01; i < 0xE2B; i++)
- AddCharMap ((char) i, 0x1E, 6, 3);
+ AddCharMap ((char) i, 0x1E, 6, 0);
fillIndex [0x1F] = 5;
for (int i = 0xE2B; i < 0xE30; i++)
- AddCharMap ((char) i, 0x1F, 6, 3);
- fillIndex [0x1F] = 0x1E;
+ AddCharMap ((char) i, 0x1F, 6, 0);
for (int i = 0xE30; i < 0xE3B; i++)
AddCharMap ((char) i, 0x1F, 1, 3);
// some Thai characters remain.
// Georgian. orderedGeorgian is from UCA DUCET.
fillIndex [0x21] = 5;
- for (int i = 0; i < orderedGeorgian.Length; i++) {
- char c = orderedGeorgian [i];
- if (map [(int) c].Defined)
- continue;
- AddCharMap (c, 0x21, 0);
- if (c < '\u10F6')
- AddCharMap ((char) (c - 0x30), 0x21, 0);
- fillIndex [0x21] += 5;
- }
+ for (int i = 0; i < orderedGeorgian.Length; i++)
+ AddLetterMap (orderedGeorgian [i], 0x21, 5);
// Japanese Kana.
fillIndex [0x22] = 2;
AddKanaMap (cp, kanaLines [gyo]);
fillIndex [0x22]++;
- if (cp == 0x30AB) {
- // add small 'ka' (before normal one)
- AddKanaMap (0x30F5, 1);
- kanaOffset++;
- }
- if (cp == 0x30B1) {
- // add small 'ke' (before normal one)
- AddKanaMap (0x30F6, 1);
- kanaOffset++;
- }
if (cp == 0x3061) {
// add small 'Tsu' (before normal one)
AddKanaMap (0x3063, 1);
fillIndex [0x22] = 0x97;
jisJapanese.Sort (JISComparer.Instance);
foreach (JISCharacter j in jisJapanese)
- if (0x3300 <= j.CP && j.CP <= 0x3357)
- AddCharMap ((char) j.CP, 0x22, 1);
+ AddCharMap ((char) j.CP, 0x22, 1);
// non-JIS Japanese square chars.
nonJisJapanese.Sort (NonJISComparer.Instance);
foreach (NonJISCharacter j in nonJisJapanese)
map [cp] = new CharMapEntry (0x24,
(byte) (map [cp - 1].Level1 + 2),
0);
- // FIXME: Syriac NonSpacingMark should go here.
// Thaana
// FIXME: it turned out that it does not look like UCA
fillIndex [0x24] = 0x6E;
for (int i = 0; i < orderedThaana.Length; i++) {
- char c = orderedThaana [i];
- if (IsIgnorableNonSpacing ((int) c))
+ if (IsIgnorableNonSpacing (i))
continue;
- AddCharMap (c, 0x24, 2);
- if (c == '\u0782') // SPECIAL CASE: why?
- fillIndex [0x24] += 2;
+ AddCharMap (orderedThaana [i], 0x24, 2);
}
#endregion
+ "<{\u1113 \u1116}, \u3165,"
+ "\u11C5, \u11C6=\u3166,, \u11C7, \u11C8,"
+ "\u11AC, \u11C9, \u11AD, \u1103=\u11AE >"
- + "<\u1117, \u11CA, \u1104, \u11CB > \u1105=\u11AF >"
+ + "<\u1117, \u11CA, \u1104, \u11CB > \u1105 >"
+ "<{\u1118 \u111B}, \u11B0, [\u11CC \u11D0], \u11B1,"
+ "[\u11D1 \u11D2], \u11B2,"
+ "[\u11D3 \u11D5], \u11B3,"
+ "\u11B6=\u11D8, \u3140,, \u11D9, \u1106=\u11B7 >"
+ "<{\u111C \u111D}, [\u11DA \u11E2], \u1107=\u11B8 >"
+ "<{\u111E \u1120}, \u3172,, \u3173, \u11E3, \u1108 >"
- + "<{\u1121 \u112C}, \u3144 \u11B9, \u3174, \u3175,,,, "
- + "\u3176,, \u3177, [\u11E4 \u11E6] \u3178,"
- + "\u3179, \u1109=\u11BA,,, \u3214=\u3274 <>"
- + "<{\u112D \u1133}, \u11E7 \u317A, \u317B, \u317C "
- + "[\u11E8 \u11E9],, \u11EA \u317D,, \u110A=\u11BB,,, >"
+ + "<{\u1121 \u112C}, \u11B9,,,,,,,,, [\u11E4 \u11E6],,"
+ + "\u1109=\u11BA,,, \u3214=\u3274 <>"
+ + "<{\u112D \u1133}, \u11E7,, [\u11E8 \u11E9],,"
+ + "\u11EA,, \u110A=\u11BB,,, >"
+ "<{\u1134 \u1140}, \u317E,,,,,,, \u11EB,"
+ "\u110B=\u11BC, [\u1161 \u11A2], \u1160 >"
+ "<{\u1141 \u114C}, \u11EE, \u11EC, \u11ED,,,,, "
}
}
- // Some Jamo NFKD.
- for (int i = 0x3200; i < 0x3300; i++) {
- if (IsIgnorable (i) || map [i].Defined)
- continue;
- int ch = 0;
- // w/ bracket
- if (decompLength [i] == 4 &&
- decompValues [decompIndex [i]] == '(')
- ch = decompIndex [i] + 1;
- // circled
- else if (decompLength [i] == 2 &&
- decompValues [decompIndex [i] + 1] == '\u1161')
- ch = decompIndex [i];
- else if (decompLength [i] == 1)
- ch = decompIndex [i];
- else
- continue;
- ch = decompValues [ch];
- if (ch < 0x1100 || 0x1200 < ch &&
- ch < 0xAC00 || 0xD800 < ch)
- continue;
-
- // SPECIAL CASE ?
- int offset = i < 0x3260 ? 1 : 0;
- if (0x326E <= i && i <= 0x3273)
- offset = 1;
-
- map [i] = new CharMapEntry (map [ch].Category,
- (byte) (map [ch].Level1 + offset),
- map [ch].Level2);
-// Console.Error.WriteLine ("Jamo {0:X04} -> {1:X04}", i, decompValues [decompIndex [i] + 1]);
- }
-
-
#endregion
// Letterlike characters and CJK compatibility square
if (IsIgnorable (i))
continue;
- // FIXME: actually those reset should not be
- // done but here I put for easy goal.
- if (i == 0x0700)
- fillIndex [0x7] = 0xE2;
- if (i == 0x2016)
- fillIndex [0x7] = 0x77;
-
// SPECIAL CASES:
switch (i) {
case 0xAB: // 08
case 0xB7: // 0A
- case 0xBB: // 08
case 0x2329: // 09
case 0x232A: // 09
continue;
}
}
// Control pictures
- // FIXME: it should not need to reset level 1, but
- // it's for easy goal.
- fillIndex [0x7] = 0xB6;
for (int i = 0x2400; i <= 0x2421; i++)
AddCharMap ((char) i, 0x7, 1, 0);
#endregion
// FIXME: for 07 xx we need more love.
+ // FIXME: 08 should be more complete.
+ fillIndex [0x8] = 2;
+ for (int cp = 0; cp < char.MaxValue; cp++)
+ if (!map [cp].Defined &&
+ Char.GetUnicodeCategory ((char) cp) ==
+ UnicodeCategory.MathSymbol)
+ AddCharMapGroup ((char) cp, 0x8, 1, 0);
+
// Characters w/ diacritical marks (NFKD)
for (int i = 0; i <= char.MaxValue; i++) {
if (map [i].Defined || IsIgnorable (i))
}
- // category 08 - symbols
- fillIndex [0x8] = 2;
- // Here Windows mapping is not straightforward. It is
- // not based on computation but seems manual sorting.
- AddCharMapGroup ('+', 0x8, 1, 0); // plus
- AddCharMapGroup ('\u2212', 0x8, 1, 0); // minus
- AddCharMapGroup ('\u229D', 0x8, 1, 0); // minus
- AddCharMapGroup ('\u2297', 0x8, 1, 0); // mul
- AddCharMapGroup ('\u2044', 0x8, 1, 0); // div
- AddCharMapGroup ('\u2215', 0x8, 1, 0); // div
- AddCharMapGroup ('\u2217', 0x8, 1, 0); // mul
- AddCharMapGroup ('\u2218', 0x8, 1, 0); // ring
- AddCharMapGroup ('\u2219', 0x8, 1, 0); // bullet
- AddCharMapGroup ('\u2213', 0x8, 1, 0); // minus-or-plus
- AddCharMapGroup ('\u003C', 0x8, 1, 0); // <
- AddCharMapGroup ('\u227A', 0x8, 1, 0); // precedes relation
- AddCharMapGroup ('\u22B0', 0x8, 1, 0); // precedes under relation
-
- for (int cp = 0; cp < 0x2300; cp++) {
- if (cp == 0x200)
- cp = 0x2200; // skip to 2200
- if (cp == 0xAC) // SPECIAL CASE: skip
- continue;
- if (!map [cp].Defined &&
-// Char.GetUnicodeCategory ((char) cp) ==
-// UnicodeCategory.MathSymbol)
- Char.IsSymbol ((char) cp))
- AddCharMapGroup ((char) cp, 0x8, 1, 0);
- // SPECIAL CASES: no idea why Windows sorts as such
- switch (cp) {
- case 0x3E:
- AddCharMap ('\u227B', 0x8, 1, 0);
- AddCharMap ('\u22B1', 0x8, 1, 0);
- break;
- case 0xB1:
- AddCharMapGroup ('\u00AB', 0x8, 1, 0);
- AddCharMapGroup ('\u226A', 0x8, 1, 0);
- AddCharMapGroup ('\u00BB', 0x8, 1, 0);
- AddCharMapGroup ('\u226B', 0x8, 1, 0);
- break;
- case 0xF7:
- AddCharMap ('\u01C0', 0x8, 1, 0);
- AddCharMap ('\u01C1', 0x8, 1, 0);
- AddCharMap ('\u01C2', 0x8, 1, 0);
- break;
- }
- }
-
#region Level2 adjustment
// Arabic Hamzah
diacritical [0x624] = 0x5;
diacritical [0x649] = 0x5; // 'alif maqs.uurah
diacritical [0x64A] = 0x7; // Yaa'
+
for (int i = 0; i < char.MaxValue; i++) {
byte mod = 0;
byte cat = map [i].Category;
mod = diacritical [i];
break;
case 0x13: // Arabic
- if (diacritical [i] == 0 && i >= 0xFE8D)
+ if (diacritical [i] == 0)
mod = 0x8; // default for arabic
break;
}
}
#endregion
- // FIXME: this is hack but those NonSpacingMark
- // characters and still undefined are likely to
- // be nonspacing.
+ // FIXME: this is hack but those which are
+ // NonSpacingMark characters and still undefined
+ // are likely to be nonspacing.
for (int i = 0; i < char.MaxValue; i++)
if (!map [i].Defined &&
!IsIgnorable (i) &&
Char.GetUnicodeCategory ((char) i) ==
UnicodeCategory.NonSpacingMark)
AddCharMap ((char) i, 1, 1);
-
- // FIXME: this is hack but those Symbol characters
- // are likely to fall into 0xA category.
- for (int i = 0; i < char.MaxValue; i++)
- if (!map [i].Defined &&
- !IsIgnorable (i) &&
- Char.IsSymbol ((char) i))
- AddCharMap ((char) i, 0xA, 1);
}
private void IncrementSequentialIndex (ref byte hangulCat)
AddCharMapCJK (c, ref category);
// LAMESPEC: see below.
- if (c == '\u5B78') {
- AddCharMapCJK ('\u32AB', ref category);
- AddCharMapCJK ('\u323B', ref category);
- }
if (c == '\u52DE') {
AddCharMapCJK ('\u3298', ref category);
AddCharMapCJK ('\u3238', ref category);
// mix Chinese and Japanese Kanji when
// ordering those characters.
switch (w) {
- case 0x32A2: case 0x3298: case 0x3238:
- case 0x32A9: case 0x323B: case 0x32AB:
+ case 0x32A2: case 0x3298: case 0x3238: case 0x32A9:
continue;
}
AddCharMap (vertical, category, updateCount, level2);
}
- private void AddArabicCharMap (char c)
- {
- byte category = 6;
- byte updateCount = 1;
- byte level2 = 0;
-
- // itself
- AddCharMap (c, category, 0, level2);
-
- // Since nfkdMap is problematic to have two or more
- // NFKD to an identical character, here I iterate all.
- for (int c2 = 0; c2 < char.MaxValue; c2++) {
- if (decompLength [c2] == 0)
- continue;
- int idx = decompIndex [c2] + decompLength [c2] - 1;
- if ((int) (decompValues [idx]) == (int) c)
- AddCharMap ((char) c2, category,
- 0, level2);
- }
- fillIndex [category] += updateCount;
- }
-
char ToFullWidth (char c)
{
return ToDecomposed (c, DecompositionFull, false);
private byte ComputeLevel3WeightRaw (char c) // add 2 for sortkey value
{
- // CJK compat
- if ('\u3192' <= c && c <= '\u319F')
- return 0;
- // Japanese reading marks
- if (c == '\u3001' || c == '\u3002')
- return 2;
// Korean
if ('\u11A8' <= c && c <= '\u11F9')
return 2;
return 4;
if ('\u3130' <= c && c <= '\u3164')
return 5;
- if ('\u3165' <= c && c <= '\u318E')
- return 4;
- // Georgian Capital letters
- if ('\u10A0' <= c && c <= '\u10C5')
- return 0x10;
// numbers
if ('\u2776' <= c && c <= '\u277F')
return 4;
if ('\u2776' <= c && c <= '\u2793')
return 0xC;
if ('\u2160' <= c && c <= '\u216F')
- return 0x10;
+ return 0x18;
if ('\u2181' <= c && c <= '\u2182')
return 0x18;
// Arabic
if ('\u2135' <= c && c <= '\u2138')
return 4;
- if ('\uFE80' <= c && c < '\uFF00') {
+ if ('\uFE80' <= c && c < '\uFE8E') {
// 2(Isolated)/8(Final)/0x18(Medial)
switch (decompType [(int) c]) {
case DecompositionIsolated:
// those ranges.
case 0x4d8: case 0x4d9:
case 0x4e8: case 0x4e9:
- case 0x70F:
case 0x3036: case 0x303f:
case 0x337b: case 0xfb1e:
return false;
{
JISCharacter j1 = (JISCharacter) o1;
JISCharacter j2 = (JISCharacter) o2;
- return j1.JIS - j2.JIS;
+ return j2.JIS - j1.JIS;
}
}
for (int i = 0; i < Source.Length; i++)
ret [i + 1] = Source [i];
// null terminate
- for (int i = 0; i < 4; i++)
+ for (int i = 0; i < 5; i++)
ret [i + Source.Length + 2] = (char) SortKey [i];
return ret;
}