// and of course, defined sortkey value itself), comparison cannot be done
// at source char[] level.
//
-// (to be continued.)
+// In IndexOf(), the first character in the target string or the target char
+// itself is turned into sortkey bytes. If the character has a contraction and
+// that is sortkey map, then it is used instead. If the contraction exists and
+// that is replacement map, then the first character of the replacement string
+// is searched instead. IndexOf() always searches only for the top character,
+// and if it returned negative value, then it returns -1. Otherwise, it then
+// tries IsPrefix() from that location. If it returns true, then it returns
+// the index.
//
+// LAMESPEC: IndexOf() is lame as a whole API. It never matches in the middle
+// of expansion and there is no proper way to return such indexes within
+// a single int return value.
+//
+// For example, try below in .NET:
+// IndexOf("\u00E6", "a")
+// IndexOf("\u00E6", "e")
+//
+
using System;
using System.Collections;
bool ignoreKanaType;
TextInfo textInfo; // for ToLower().
bool frenchSort;
- readonly ushort [] cjkTable;
+ unsafe readonly byte* cjkCatTable;
+ unsafe readonly byte* cjkLv1Table;
readonly CodePointIndexer cjkIndexer;
- readonly byte [] cjkLv2Table;
+ unsafe readonly byte* cjkLv2Table;
readonly CodePointIndexer cjkLv2Indexer;
readonly int lcid;
readonly Contraction [] contractions;
readonly Level2Map [] level2Maps;
- byte [] charSortKey = new byte [4];
-
- #region Tailoring support classes
- // Possible mapping types are:
- //
- // - string to string (ReplacementMap)
- // - string to SortKey (SortKeyMap)
- // - diacritical byte to byte (DiacriticalMap)
- //
- // There could be mapping from string to sortkeys, but
- // for now there is none as such.
- //
- internal class Contraction
- {
- public readonly char [] Source;
- // only either of them is used.
- public readonly string Replacement;
- public readonly byte [] SortKey;
-
- public Contraction (char [] source,
- string replacement, byte [] sortkey)
- {
- Source = source;
- Replacement = replacement;
- SortKey = sortkey;
- }
- }
- internal class ContractionComparer : IComparer
- {
- public static readonly ContractionComparer Instance =
- new ContractionComparer ();
-
- public int Compare (object o1, object o2)
- {
- Contraction c1 = (Contraction) o1;
- Contraction c2 = (Contraction) o2;
- char [] a1 = c1.Source;
- char [] a2 = c2.Source;
- int min = a1.Length > a2.Length ?
- a2.Length : a1.Length;
- for (int i = 0; i < min; i++)
- if (a1 [i] != a2 [i])
- return a1 [i] - a2 [i];
- return a1.Length - a2.Length;
- }
- }
-
- internal class Level2Map
- {
- public byte Source;
- public byte Replace;
-
- public Level2Map (byte source, byte replace)
- {
- Source = source;
- Replace = replace;
- }
- }
-
- internal class Level2MapComparer : IComparer
- {
- public static readonly Level2MapComparer Instance =
- new Level2MapComparer ();
+ // This flag marks characters as "unsafe", where the character
+ // could be used as part of a contraction (whose length > 1).
+ readonly bool [] unsafeFlags;
- public int Compare (object o1, object o2)
- {
- Level2Map m1 = (Level2Map) o1;
- Level2Map m2 = (Level2Map) o2;
- return (m1.Source - m2.Source);
- }
- }
+ const int UnsafeFlagLength = 0x300;
- #endregion
+ // temporary sortkey buffer for index search/comparison
+ byte [] charSortKey = new byte [4];
+ byte [] charSortKey2 = new byte [4];
+ byte [] charSortKeyIndexTarget = new byte [4];
#region .ctor() and split functions
textInfo = culture.TextInfo;
buf = new SortKeyBuffer (culture.LCID);
- SetCJKTable (culture, ref cjkTable, ref cjkIndexer,
- ref cjkLv2Table, ref cjkLv2Indexer);
+ unsafe {
+ SetCJKTable (culture, ref cjkIndexer,
+ ref cjkCatTable, ref cjkLv1Table,
+ ref cjkLv2Indexer, ref cjkLv2Table);
+ }
// Get tailoring info
TailoringInfo t = null;
t = Uni.GetTailoringInfo (127);
frenchSort = t.FrenchSort;
- BuildTailoringTables (culture, t, ref contractions,
+ Uni.BuildTailoringTables (culture, t, ref contractions,
ref level2Maps);
+ unsafeFlags = new bool [UnsafeFlagLength];
+ foreach (Contraction c in contractions)
+ if (c.Source.Length > 1)
+ foreach (char ch in c.Source)
+ unsafeFlags [(int) ch] = true;
+
// FIXME: Since tailorings are mostly for latin
// (and in some cases Cyrillic) characters, it would
// be much better for performance to store "start
foreach (Contraction c in contractions) {
foreach (char cc in c.Source)
Console.Write ("{0:X4} ", (int) cc);
-Console.WriteLine (" -> {0}", c.Replacement);
+Console.WriteLine (" -> '{0}'", c.Replacement);
}
*/
}
- private void BuildTailoringTables (CultureInfo culture,
- TailoringInfo t,
- ref Contraction [] contractions,
- ref Level2Map [] diacriticals)
+ unsafe private void SetCJKTable (
+ CultureInfo culture, ref CodePointIndexer cjkIndexer,
+ ref byte* catTable, ref byte* lv1Table,
+ ref CodePointIndexer lv2Indexer, ref byte* lv2Table)
{
- // collect tailoring entries.
- ArrayList cmaps = new ArrayList ();
- ArrayList dmaps = new ArrayList ();
- char [] tarr = Uni.TailoringValues;
- int idx = t.TailoringIndex;
- int end = idx + t.TailoringCount;
- while (idx < end) {
- int ss = idx + 1;
- char [] src = null;
- switch (tarr [idx]) {
- case '\x1': // SortKeyMap
- idx++;
- while (tarr [ss] != 0)
- ss++;
- src = new char [ss - idx];
- Array.Copy (tarr, idx, src, 0, ss - idx);
- byte [] sortkey = new byte [4];
- for (int i = 0; i < 4; i++)
- sortkey [i] = (byte) tarr [ss + 1 + i];
- cmaps.Add (new Contraction (
- src, null, sortkey));
- // it ends with 0
- idx = ss + 6;
- break;
- case '\x2': // DiacriticalMap
- dmaps.Add (new Level2Map (
- (byte) tarr [idx + 1],
- (byte) tarr [idx + 2]));
- idx += 3;
- break;
- case '\x3': // ReplacementMap
- idx++;
- while (tarr [ss] != 0)
- ss++;
- src = new char [ss - idx];
- Array.Copy (tarr, idx, src, 0, ss - idx);
- int l = ss + 1;
- while (tarr [l] != 0)
- l++;
- string r = new string (tarr, ss, l - ss);
- cmaps.Add (new Contraction (
- src, r, null));
- idx = l + 1;
- break;
- default:
- throw new NotImplementedException (String.Format ("Mono INTERNAL ERROR (Should not happen): Collation tailoring table is broken for culture {0} ({1}) at 0x{2:X}", culture.LCID, culture.Name, idx));
- }
- }
- cmaps.Sort (ContractionComparer.Instance);
- dmaps.Sort (Level2MapComparer.Instance);
- contractions = cmaps.ToArray (typeof (Contraction))
- as Contraction [];
- diacriticals = dmaps.ToArray (typeof (Level2Map))
- as Level2Map [];
- }
-
- private void SetCJKTable (CultureInfo culture,
- ref ushort [] cjkTable, ref CodePointIndexer cjkIndexer,
- ref byte [] cjkLv2Table, ref CodePointIndexer cjkLv2Indexer)
- {
- // custom CJK table support.
- switch (GetNeutralCulture (culture).Name) {
- case "zh-CHS":
- cjkTable = Uni.CjkCHS;
- cjkIndexer = UUtil.CjkCHS;
- break;
- case "zh-CHT":
- cjkTable = Uni.CjkCHT;
- cjkIndexer = UUtil.Cjk;
- break;
- case "ja":
- cjkTable = Uni.CjkJA;
- cjkIndexer = UUtil.Cjk;
- break;
- case "ko":
- cjkTable = Uni.CjkKO;
- cjkLv2Table = Uni.CjkKOLv2;
- cjkIndexer = UUtil.Cjk;
- cjkLv2Indexer = UUtil.Cjk;
- break;
- }
+ string name = GetNeutralCulture (culture).Name;
+
+ Uni.FillCJK (name, ref cjkIndexer, ref catTable,
+ ref lv1Table, ref lv2Indexer, ref lv2Table);
}
static CultureInfo GetNeutralCulture (CultureInfo info)
#endregion
- byte Category (int cp)
+ unsafe byte Category (int cp)
{
- if (cp < 0x3000 || cjkTable == null)
- return Uni.Categories (cp);
- ushort cjk = cjkTable [cjkIndexer.ToIndex (cp)];
- return cjk != 0 ? (byte) ((cjk & 0xFF00) >> 8) :
- Uni.Categories (cp);
+ if (cp < 0x3000 || cjkCatTable == null)
+ return Uni.Category (cp);
+ int idx = cjkIndexer.ToIndex (cp);
+ return idx < 0 ? Uni.Category (cp) : cjkCatTable [idx];
}
- byte Level1 (int cp)
+ unsafe byte Level1 (int cp)
{
- if (cp < 0x3000 || cjkTable == null)
+ if (cp < 0x3000 || cjkLv1Table == null)
return Uni.Level1 (cp);
- ushort cjk = cjkTable [cjkIndexer.ToIndex (cp)];
- return cjk != 0 ? (byte) (cjk & 0xFF) : Uni.Level1 (cp);
+ int idx = cjkIndexer.ToIndex (cp);
+ return idx < 0 ? Uni.Level1 (cp) : cjkLv1Table [idx];
}
- byte Level2 (int cp)
+ unsafe byte Level2 (int cp, ExtenderType ext)
{
+ if (ext == ExtenderType.Buggy)
+ return 5;
+ else if (ext == ExtenderType.Conditional)
+ return 0;
+
if (cp < 0x3000 || cjkLv2Table == null)
return Uni.Level2 (cp);
- byte ret = cjkLv2Table [cjkLv2Indexer.ToIndex (cp)];
+ int idx = cjkLv2Indexer.ToIndex (cp);
+ byte ret = idx < 0 ? (byte) 0 : cjkLv2Table [idx];
if (ret != 0)
return ret;
ret = Uni.Level2 (cp);
return ret;
}
+ bool IsHalfKana (int cp)
+ {
+ return ignoreWidth || Uni.IsHalfWidthKana ((char) cp);
+ }
+
void SetOptions (CompareOptions options)
{
this.ignoreNonSpace = (options & CompareOptions.IgnoreNonSpace) != 0;
this.ignoreWidth = (options & CompareOptions.IgnoreWidth) != 0;
this.ignoreCase = (options & CompareOptions.IgnoreCase) != 0;
this.ignoreKanaType = (options & CompareOptions.IgnoreKanaType) != 0;
+ previousChar = previousChar2 = -1;
+ previousSortKey = previousSortKey2 = null;
+ escape1.Source = escape2.Source = null;
}
Contraction GetContraction (string s, int start, int end)
{
for (int i = 0; i < clist.Length; i++) {
Contraction ct = clist [i];
- if (ct.Source [0] > s [start])
+ int diff = ct.Source [0] - s [start];
+ if (diff > 0)
return null; // it's already sorted
+ else if (diff < 0)
+ continue;
char [] chars = ct.Source;
if (end - start < chars.Length)
continue;
return null;
}
+ Contraction GetTailContraction (string s, int start, int end)
+ {
+ Contraction c = GetTailContraction (s, start, end, contractions);
+ if (c != null || lcid == 127)
+ return c;
+ return GetTailContraction (s, start, end, invariant.contractions);
+ }
+
+ Contraction GetTailContraction (string s, int start, int end, Contraction [] clist)
+ {
+ for (int i = 0; i < clist.Length; i++) {
+ Contraction ct = clist [i];
+ int diff = ct.Source [0] - s [end];
+ if (diff > 0)
+ return null; // it's already sorted
+ else if (diff < 0)
+ continue;
+ char [] chars = ct.Source;
+ if (start - end + 1 < chars.Length)
+ continue;
+ bool match = true;
+ int offset = start - chars.Length + 1;
+ for (int n = 0; n < chars.Length; n++)
+ if (s [offset + n] != chars [n]) {
+ match = false;
+ break;
+ }
+ if (match)
+ return ct;
+ }
+ return null;
+ }
+
Contraction GetContraction (char c)
{
Contraction ct = GetContraction (c, contractions);
return null;
}
- // FIXME: It should not be used, since it disregards both
- // sortkey maps and replacement map from two or more chars.
- string GetExpansion (int i)
- {
- return Uni.GetExpansion ((char) i);
- }
-
int FilterOptions (int i)
{
- if (ignoreWidth)
- i = Uni.ToWidthCompat (i);
+ if (ignoreWidth) {
+ int x = Uni.ToWidthCompat (i);
+ if (x != 0)
+ i = x;
+ }
if (ignoreCase)
i = textInfo.ToLower ((char) i);
if (ignoreKanaType)
return i;
}
+ int previousChar = -1;
+ byte [] previousSortKey = null;
+ int previousChar2 = -1;
+ byte [] previousSortKey2 = null;
+
+ enum ExtenderType {
+ None,
+ Simple,
+ Voiced,
+ Conditional,
+ Buggy,
+ }
+
+ ExtenderType GetExtenderType (int i)
+ {
+ // LAMESPEC: Windows expects true for U+3005, but
+ // sometimes it does not represent to repeat just
+ // one character.
+ // Windows also expects true for U+3031 and U+3032,
+ // but they should *never* repeat one character.
+
+ // U+2015 becomes an extender only when it is Japanese
+ if (i == 0x2015)
+ return lcid == 16 ? ExtenderType.Conditional :
+ ExtenderType.None;
+
+ if (i < 0x3005 || i > 0xFF70)
+ return ExtenderType.None;
+ if (i >= 0xFE7C) {
+ switch (i) {
+ case 0xFE7C:
+ case 0xFE7D:
+ return ExtenderType.Simple;
+ case 0xFF70:
+ return ExtenderType.Conditional;
+ case 0xFF9E:
+ case 0xFF9F:
+ return ExtenderType.Voiced;
+ }
+ }
+ if (i > 0x30FE)
+ return ExtenderType.None;
+ switch (i) {
+ case 0x3005: // LAMESPEC: see above.
+ return ExtenderType.Buggy;
+ case 0x3031: // LAMESPEC: see above.
+ case 0x3032: // LAMESPEC: see above.
+ case 0x309D:
+ case 0x30FD:
+ return ExtenderType.Simple;
+ case 0x309E:
+ case 0x30FE:
+ return ExtenderType.Voiced;
+ case 0x30FC:
+ return ExtenderType.Conditional;
+ default:
+ return ExtenderType.None;
+ }
+ }
+
+ byte ToDashTypeValue (ExtenderType ext)
+ {
+ if (ignoreNonSpace) // LAMESPEC: huh, why?
+ return 3;
+ switch (ext) {
+ case ExtenderType.None:
+ return 3;
+ case ExtenderType.Conditional:
+ return 5;
+ default:
+ return 4;
+ }
+ }
+
+ int FilterExtender (int i, ExtenderType ext)
+ {
+ if (ext == ExtenderType.Conditional &&
+ Uni.HasSpecialWeight ((char) i)) {
+ bool half = IsHalfKana ((char) i);
+ bool katakana = !Uni.IsHiragana ((char) i);
+ switch (Level1 (i) & 7) {
+ case 2:
+ return half ? 0xFF71 : katakana ? 0x30A2 : 0x3042;
+ case 3:
+ return half ? 0xFF72 : katakana ? 0x30A4 : 0x3044;
+ case 4:
+ return half ? 0xFF73 : katakana ? 0x30A6 : 0x3046;
+ case 5:
+ return half ? 0xFF74 : katakana ? 0x30A8 : 0x3048;
+ case 6:
+ return half ? 0xFF75 : katakana ? 0x30AA : 0x304A;
+ }
+ }
+ return i;
+ }
+
+ bool IsIgnorable (int i)
+ {
+ return Uni.IsIgnorable (i) ||
+ ignoreSymbols && Uni.IsIgnorableSymbol (i) ||
+ ignoreNonSpace && Uni.IsIgnorableNonSpacing (i);
+ }
+
+ bool IsSafe (int i)
+ {
+ return i >= unsafeFlags.Length ? true : !unsafeFlags [i];
+ }
+
#region GetSortKey()
public SortKey GetSortKey (string s)
{
SetOptions (options);
- buf.Initialize (options, s, frenchSort);
+ buf.Initialize (options, lcid, s, frenchSort);
int end = start + length;
GetSortKey (s, start, end);
return buf.GetResultAndReset ();
{
for (int n = start; n < end; n++) {
int i = s [n];
+
+ ExtenderType ext = GetExtenderType (i);
+ if (ext != ExtenderType.None) {
+ i = FilterExtender (previousChar, ext);
+ if (i >= 0)
+ FillSortKeyRaw (i, ext);
+ else if (previousSortKey != null) {
+ byte [] b = previousSortKey;
+ buf.AppendNormal (
+ b [0],
+ b [1],
+ b [2] != 1 ? b [2] : Level2 (i, ext),
+ b [3] != 1 ? b [3] : Uni.Level3 (i));
+ }
+ // otherwise do nothing.
+ // (if the extender is the first char
+ // in the string, then just ignore.)
+ continue;
+ }
+
if (IsIgnorable (i))
continue;
i = FilterOptions (i);
Contraction ct = GetContraction (s, n, end);
if (ct != null) {
- if (ct.Replacement != null)
+ if (ct.Replacement != null) {
GetSortKey (ct.Replacement, 0, ct.Replacement.Length);
- else {
+ } else {
byte [] b = ct.SortKey;
buf.AppendNormal (
b [0],
b [1],
- b [2] != 1 ? b [2] : Level2 (i),
+ b [2] != 1 ? b [2] : Level2 (i, ext),
b [3] != 1 ? b [3] : Uni.Level3 (i));
+ previousSortKey = b;
+ previousChar = -1;
}
n += ct.Source.Length - 1;
}
- else
- FillSortKeyRaw (i);
+ else {
+ if (!Uni.IsIgnorableNonSpacing (i))
+ previousChar = i;
+ FillSortKeyRaw (i, ExtenderType.None);
+ }
}
}
- bool IsIgnorable (int i)
- {
- return Uni.IsIgnorable (i) ||
- ignoreSymbols && Uni.IsIgnorableSymbol (i);
- }
-
- void FillSortKeyRaw (int i)
+ void FillSortKeyRaw (int i, ExtenderType ext)
{
if (0x3400 <= i && i <= 0x4DB5) {
int diff = i - 0x3400;
return;
}
- if (Uni.HasSpecialWeight ((char) i))
+ byte level2 = Level2 (i, ext);
+ if (Uni.HasSpecialWeight ((char) i)) {
+ byte level1 = Level1 (i);
buf.AppendKana (
Category (i),
- Level1 (i),
- Level2 (i),
+ level1,
+ level2,
Uni.Level3 (i),
Uni.IsJapaneseSmallLetter ((char) i),
- Uni.GetJapaneseDashType ((char) i),
+ ToDashTypeValue (ext),
!Uni.IsHiragana ((char) i),
- Uni.IsHalfWidthKana ((char) i)
+ IsHalfKana ((char) i)
);
+ if (!ignoreNonSpace && ext == ExtenderType.Voiced)
+ // Append voice weight
+ buf.AppendNormal (1, 1, 1, 0);
+ }
else
buf.AppendNormal (
Category (i),
Level1 (i),
- Level2 (i),
+ level2,
Uni.Level3 (i));
}
return Compare (s1, 0, s1.Length, s2, 0, s2.Length, options);
}
+ class Escape
+ {
+ public string Source;
+ public int Index;
+ public int Start;
+ public int End;
+ public int Optional;
+ }
+
+ // Those instances are reused not to invoke instantiation
+ // during Compare().
+ Escape escape1 = new Escape ();
+ Escape escape2 = new Escape ();
+
+ private int CompareOrdinal (string s1, int idx1, int len1,
+ string s2, int idx2, int len2)
+ {
+ int min = len1 < len2 ? len1 : len2;
+ int end1 = idx1 + min;
+ int end2 = idx2 + min;
+ for (int i1 = idx1, i2 = idx2;
+ i1 < end1 && i2 < end2; i1++, i2++)
+ if (s1 [i1] != s2 [i2])
+ return s1 [i1] - s2 [i2];
+ return len1 == len2 ? 0 :
+ len1 == min ? - 1 : 1;
+ }
+
public int Compare (string s1, int idx1, int len1,
string s2, int idx2, int len2, CompareOptions options)
{
+ // quick equality check
+ if (idx1 == idx2 && len1 == len2 &&
+ Object.ReferenceEquals (s1, s2))
+ return 0;
+ // FIXME: this should be done inside Compare() at
+ // any time.
+// int ord = CompareOrdinal (s1, idx1, len1, s2, idx2, len2);
+// if (ord == 0)
+// return 0;
+ if (options == CompareOptions.Ordinal)
+ return CompareOrdinal (s1, idx1, len1, s2, idx2, len2);
+
+#if false // stable easy version, depends on GetSortKey().
SortKey sk1 = GetSortKey (s1, idx1, len1, options);
SortKey sk2 = GetSortKey (s2, idx2, len2, options);
byte [] d1 = sk1.KeyData;
if (d1 [i] != d2 [i])
return d1 [i] < d2 [i] ? -1 : 1;
return d1.Length == d2.Length ? 0 : d1.Length < d2.Length ? -1 : 1;
+#else
+ SetOptions (options);
+ bool dummy, dummy2;
+ int ret = CompareInternal (s1, idx1, len1, s2, idx2, len2, (options & CompareOptions.StringSort) != 0, out dummy, out dummy2, true);
+ return ret == 0 ? 0 : ret < 0 ? -1 : 1;
+#endif
+ }
+
+ int CompareInternal (string s1, int idx1, int len1, string s2,
+ int idx2, int len2, bool stringSort,
+ out bool targetConsumed, out bool sourceConsumed,
+ bool skipHeadingExtenders)
+ {
+ int start1 = idx1;
+ int start2 = idx2;
+ int end1 = idx1 + len1;
+ int end2 = idx2 + len2;
+ targetConsumed = false;
+ sourceConsumed = false;
+
+ // It holds final result that comes from the comparison
+ // at level 2 or lower. Even if Compare() found the
+ // difference at level 2 or lower, it still has to
+ // continue level 1 comparison. FinalResult is used
+ // when there was no further differences.
+ int finalResult = 0;
+ // It holds the comparison level to do. It starts from
+ // 5, and becomes 1 when we do primary-only comparison.
+ int currentLevel = 5;
+
+ int lv5At1 = -1;
+ int lv5At2 = -1;
+ int lv5Value1 = 0;
+ int lv5Value2 = 0;
+
+ // Skip heading extenders
+ if (skipHeadingExtenders) {
+ for (; idx1 < end1; idx1++)
+ if (GetExtenderType (s1 [idx1]) == ExtenderType.None)
+ break;
+ for (; idx2 < end2; idx2++)
+ if (GetExtenderType (s2 [idx2]) == ExtenderType.None)
+ break;
+ }
+
+ ExtenderType ext1 = ExtenderType.None;
+ ExtenderType ext2 = ExtenderType.None;
+
+ int quickCheckPos1 = idx1;
+ int quickCheckPos2 = idx2;
+
+ while (true) {
+ for (; idx1 < end1; idx1++)
+ if (!IsIgnorable (s1 [idx1]))
+ break;
+ for (; idx2 < end2; idx2++)
+ if (!IsIgnorable (s2 [idx2]))
+ break;
+
+ if (idx1 >= end1) {
+ if (escape1.Source == null)
+ break;
+ s1 = escape1.Source;
+ start1 = escape1.Start;
+ idx1 = escape1.Index;
+ end1 = escape1.End;
+ quickCheckPos1 = escape1.Optional;
+ escape1.Source = null;
+ continue;
+ }
+ if (idx2 >= end2) {
+ if (escape2.Source == null)
+ break;
+ s2 = escape2.Source;
+ start2 = escape2.Start;
+ idx2 = escape2.Index;
+ end2 = escape2.End;
+ quickCheckPos2 = escape2.Optional;
+ escape2.Source = null;
+ continue;
+ }
+#if true
+// If comparison is unstable, then this part is one of the most doubtful part.
+// Here we run quick codepoint comparison and run back to "stable" area to
+// compare characters.
+
+ // Strictly to say, even the first character
+ // could be compared here, but it messed
+ // backward step, so I just avoided mess.
+ if (quickCheckPos1 < idx1 && quickCheckPos2 < idx2) {
+ while (idx1 < end1 && idx2 < end2 &&
+ s1 [idx1] == s2 [idx2]) {
+ idx1++;
+ idx2++;
+ }
+ if (idx1 == end1 || idx2 == end2)
+ continue; // check replacement
+
+ int backwardEnd1 = quickCheckPos1;
+ int backwardEnd2 = quickCheckPos2;
+ quickCheckPos1 = idx1;
+ quickCheckPos2 = idx2;
+
+ idx1--;
+ idx2--;
+
+ for (;idx1 > backwardEnd1; idx1--)
+ if (Category (s1 [idx1]) != 1)
+ break;
+ for (;idx2 > backwardEnd2; idx2--)
+ if (Category (s2 [idx2]) != 1)
+ break;
+ for (;idx1 > backwardEnd1; idx1--)
+ if (IsSafe (s1 [idx1]))
+ break;
+ for (;idx2 > backwardEnd2; idx2--)
+ if (IsSafe (s2 [idx2]))
+ break;
+ }
+#endif
+
+ int cur1 = idx1;
+ int cur2 = idx2;
+ byte [] sk1 = null;
+ byte [] sk2 = null;
+ int i1 = FilterOptions (s1 [idx1]);
+ int i2 = FilterOptions (s2 [idx2]);
+ bool special1 = false;
+ bool special2 = false;
+
+ // If current character is an expander, then
+ // repeat the previous character.
+ ext1 = GetExtenderType (i1);
+ if (ext1 != ExtenderType.None) {
+ if (previousChar < 0) {
+ if (previousSortKey == null) {
+ // nothing to extend
+ idx1++;
+ continue;
+ }
+ sk1 = previousSortKey;
+ }
+ else
+ i1 = FilterExtender (previousChar, ext1);
+ }
+ ext2 = GetExtenderType (i2);
+ if (ext2 != ExtenderType.None) {
+ if (previousChar2 < 0) {
+ if (previousSortKey2 == null) {
+ // nothing to extend
+ idx2++;
+ continue;
+ }
+ sk2 = previousSortKey2;
+ }
+ else
+ i2 = FilterExtender (previousChar2, ext2);
+ }
+
+ byte cat1 = Category (i1);
+ byte cat2 = Category (i2);
+
+ // Handle special weight characters
+ if (!stringSort && currentLevel > 4) {
+ if (cat1 == 6) {
+ lv5At1 = escape1.Source != null ?
+ escape1.Index - escape1.Start :
+ cur1 - start1;
+ // here Windows has a bug that it does
+ // not consider thirtiary weight.
+ lv5Value1 = Level1 (i1) << 8 + Uni.Level3 (i1);
+ previousChar = i1;
+ idx1++;
+ }
+ if (cat2 == 6) {
+ lv5At2 = escape2.Source != null ?
+ escape2.Index - escape2.Start :
+ cur2 - start2;
+ // here Windows has a bug that it does
+ // not consider thirtiary weight.
+ lv5Value2 = Level1 (i2) << 8 + Uni.Level3 (i2);
+ previousChar2 = i2;
+ idx2++;
+ }
+ if (cat1 == 6 || cat2 == 6) {
+ currentLevel = 4;
+ continue;
+ }
+ }
+
+ Contraction ct1 = null;
+ if (ext1 == ExtenderType.None)
+ ct1 = GetContraction (s1, idx1, end1);
+
+ int offset1 = 1;
+ if (sk1 != null)
+ offset1 = 1;
+ else if (ct1 != null) {
+ offset1 = ct1.Source.Length;
+ if (ct1.SortKey != null) {
+ sk1 = charSortKey;
+ for (int i = 0; i < ct1.SortKey.Length; i++)
+ sk1 [i] = ct1.SortKey [i];
+ previousChar = -1;
+ previousSortKey = sk1;
+ }
+ else if (escape1.Source == null) {
+ escape1.Source = s1;
+ escape1.Start = start1;
+ escape1.Index = cur1 + ct1.Source.Length;
+ escape1.End = end1;
+ escape1.Optional = quickCheckPos1;
+ s1 = ct1.Replacement;
+ idx1 = 0;
+ start1 = 0;
+ end1 = s1.Length;
+ quickCheckPos1 = 0;
+ continue;
+ }
+ }
+ else {
+ sk1 = charSortKey;
+ sk1 [0] = cat1;
+ sk1 [1] = Level1 (i1);
+ if (!ignoreNonSpace && currentLevel > 1)
+ sk1 [2] = Level2 (i1, ext1);
+ if (currentLevel > 2)
+ sk1 [3] = Uni.Level3 (i1);
+ if (currentLevel > 3)
+ special1 = Uni.HasSpecialWeight ((char) i1);
+ if (cat1 > 1)
+ previousChar = i1;
+ }
+
+ Contraction ct2 = null;
+ if (ext2 == ExtenderType.None)
+ ct2 = GetContraction (s2, idx2, end2);
+
+ if (sk2 != null)
+ idx2++;
+ else if (ct2 != null) {
+ idx2 += ct2.Source.Length;
+ if (ct2.SortKey != null) {
+ sk2 = charSortKey2;
+ for (int i = 0; i < ct2.SortKey.Length; i++)
+ sk2 [i] = ct2.SortKey [i];
+ previousChar2 = -1;
+ previousSortKey2 = sk2;
+ }
+ else if (escape2.Source == null) {
+ escape2.Source = s2;
+ escape2.Start = start2;
+ escape2.Index = cur2 + ct2.Source.Length;
+ escape2.End = end2;
+ escape2.Optional = quickCheckPos2;
+ s2 = ct2.Replacement;
+ idx2 = 0;
+ start2 = 0;
+ end2 = s2.Length;
+ quickCheckPos2 = 0;
+ continue;
+ }
+ }
+ else {
+ sk2 = charSortKey2;
+ sk2 [0] = cat2;
+ sk2 [1] = Level1 (i2);
+ if (!ignoreNonSpace && currentLevel > 1)
+ sk2 [2] = Level2 (i2, ext2);
+ if (currentLevel > 2)
+ sk2 [3] = Uni.Level3 (i2);
+ if (currentLevel > 3)
+ special2 = Uni.HasSpecialWeight ((char) i2);
+ if (cat2 > 1)
+ previousChar2 = i2;
+ idx2++;
+ }
+
+ // Add offset here so that it does not skip
+ // idx1 while s2 has a replacement.
+ idx1 += offset1;
+
+ // add diacritical marks in s1 here
+ if (!ignoreNonSpace) {
+ while (idx1 < end1) {
+ if (Category (s1 [idx1]) != 1)
+ break;
+ if (sk1 [2] == 0)
+ sk1 [2] = 2;
+ sk1 [2] = (byte) (sk1 [2] +
+ Level2 (s1 [idx1], ExtenderType.None));
+ idx1++;
+ }
+
+ // add diacritical marks in s2 here
+ while (idx2 < end2) {
+ if (Category (s2 [idx2]) != 1)
+ break;
+ if (sk2 [2] == 0)
+ sk2 [2] = 2;
+ sk2 [2] = (byte) (sk2 [2] +
+ Level2 (s2 [idx2], ExtenderType.None));
+ idx2++;
+ }
+ }
+
+ int ret = sk1 [0] - sk2 [0];
+ ret = ret != 0 ? ret : sk1 [1] - sk2 [1];
+ if (ret != 0)
+ return ret;
+ if (currentLevel == 1)
+ continue;
+ if (!ignoreNonSpace) {
+ ret = sk1 [2] - sk2 [2];
+ if (ret != 0) {
+ finalResult = ret;
+ currentLevel = frenchSort ? 2 : 1;
+ continue;
+ }
+ }
+ if (currentLevel == 2)
+ continue;
+ ret = sk1 [3] - sk2 [3];
+ if (ret != 0) {
+ finalResult = ret;
+ currentLevel = 2;
+ continue;
+ }
+ if (currentLevel == 3)
+ continue;
+ if (special1 != special2) {
+ finalResult = special1 ? 1 : -1;
+ currentLevel = 3;
+ continue;
+ }
+ if (special1) {
+ ret = CompareFlagPair (
+ !Uni.IsJapaneseSmallLetter ((char) i1),
+ !Uni.IsJapaneseSmallLetter ((char) i2));
+ ret = ret != 0 ? ret :
+ ToDashTypeValue (ext1) -
+ ToDashTypeValue (ext2);
+ ret = ret != 0 ? ret : CompareFlagPair (
+ Uni.IsHiragana ((char) i1),
+ Uni.IsHiragana ((char) i2));
+ ret = ret != 0 ? ret : CompareFlagPair (
+ !IsHalfKana ((char) i1),
+ !IsHalfKana ((char) i2));
+ if (ret != 0) {
+ finalResult = ret;
+ currentLevel = 3;
+ continue;
+ }
+ }
+ }
+
+ // If there were only level 3 or lower differences,
+ // then we still have to find diacritical differences
+ // if any.
+ if (!ignoreNonSpace &&
+ finalResult != 0 && currentLevel > 2) {
+ while (idx1 < end1 && idx2 < end2) {
+ if (!Uni.IsIgnorableNonSpacing (s1 [idx1]))
+ break;
+ if (!Uni.IsIgnorableNonSpacing (s2 [idx2]))
+ break;
+ finalResult = Level2 (FilterOptions ((s1 [idx1])), ext1) - Level2 (FilterOptions (s2 [idx2]), ext2);
+ if (finalResult != 0)
+ break;
+ idx1++;
+ idx2++;
+ // they should work only for the first character
+ ext1 = ExtenderType.None;
+ ext2 = ExtenderType.None;
+ }
+ }
+ if (currentLevel == 1 && finalResult != 0) {
+ while (idx1 < end1)
+ if (Uni.IsIgnorableNonSpacing (s1 [idx1]))
+ idx1++;
+ while (idx2 < end2)
+ if (Uni.IsIgnorableNonSpacing (s2 [idx2]))
+ idx2++;
+ }
+ // we still have to handle level 5
+ if (finalResult == 0) {
+ finalResult = lv5At1 - lv5At2;
+ if (finalResult == 0)
+ finalResult = lv5Value1 - lv5Value2;
+ }
+ if (finalResult == 0) {
+ if (idx2 == end2)
+ targetConsumed = true;
+ if (idx1 == end1)
+ sourceConsumed = true;
+ }
+ return idx1 != end1 ? 1 : idx2 == end2 ? finalResult : -1;
+ }
+
+ int CompareFlagPair (bool b1, bool b2)
+ {
+ return b1 == b2 ? 0 : b1 ? 1 : -1;
}
#endregion
- #region IsPrefix()
+ #region IsPrefix() and IsSuffix()
public bool IsPrefix (string src, string target, CompareOptions opt)
{
public bool IsPrefix (string s, string target, int start, int length, CompareOptions opt)
{
SetOptions (opt);
- return IsPrefix (s, target, start, length) >= 0;
+ return IsPrefix (s, target, start, length,
+ (opt & CompareOptions.StringSort) != 0, true);
}
- // returns the consumed length in positive number, or -1 if
- // target was not a prefix.
- int IsPrefix (string s, string target, int start, int length)
+ public bool IsPrefix (string s, string target, int start, int length, bool stringSort, bool skipHeadingExtenders)
{
- int min = length > target.Length ? target.Length : length;
- int si = start;
-
- // FIXME: this is not enough to handle tailorings.
- for (int ti = 0; ti < min; ti++, si++) {
- // FIXME: should handle expansions (and it
- // should be before codepoint comparison).
- string expansion = GetExpansion (s [si]);
- if (expansion != null)
- return -si;
- expansion = GetExpansion (target [ti]);
- if (expansion != null)
- return -si;
-
- // char-by-char comparison.
- int ret = CompareCharSimple (s, target, ref si, ref ti);
- if (ret < 0)
- return -ret;
- }
- if (length == min) {
- // All codepoints in the compared range
- // matches. In that case, what matters
- // is whether the remaining part of
- // "target" is ignorable or not.
- for (int i = min; i < target.Length; i++)
- if (!IsIgnorable (target [i]))
- return -si;
- return si;
- }
- return si;
- }
-
- private int CompareCharSimple (string s, string target, ref int si, ref int ti)
- {
- // char-by-char comparison.
- if (IsIgnorable (s [si])) {
- if (!IsIgnorable (target [ti]))
- ti--;
- return 0;
- }
- else if (IsIgnorable (target [ti])) {
- si--;
- return 0;
- }
- int ci = FilterOptions (s [si]);
- int cj = FilterOptions (target [ti]);
- if (ci == cj)
- return 0;
- // lv.1 to 3
- if (Category (ci) != Category (cj) ||
- Level1 (ci) != Level1 (cj) ||
- !ignoreNonSpace && Level2 (ci) != Level2 (cj) ||
- Uni.Level3 (ci) != Uni.Level3 (cj))
- return -si;
- // lv.4 (only when required)
- if (!Uni.HasSpecialWeight ((char) ci))
- return 0;
- if (Uni.IsJapaneseSmallLetter ((char) ci) !=
- Uni.IsJapaneseSmallLetter ((char) cj) ||
- Uni.GetJapaneseDashType ((char) ci) !=
- Uni.GetJapaneseDashType ((char) cj) ||
- !Uni.IsHiragana ((char) ci) !=
- !Uni.IsHiragana ((char) cj) ||
- Uni.IsHalfWidthKana ((char) ci) !=
- Uni.IsHalfWidthKana ((char) cj))
- return -si;
- return 0;
+ bool consumed, dummy;
+ int ret = CompareInternal (s, start, length,
+ target, 0, target.Length, stringSort,
+ out consumed, out dummy, skipHeadingExtenders);
+ return consumed;
}
- #endregion
- #region IsSuffix()
+ // IsSuffix()
public bool IsSuffix (string src, string target, CompareOptions opt)
{
public bool IsSuffix (string s, string target, int start, int length, CompareOptions opt)
{
+ // quick check : simple codepoint comparison
+ if (s.Length >= target.Length) {
+ int si = start;
+ int se = start - length;
+ for (int i = target.Length - 1; si >= se && i >= 0; i--, si--)
+ if (s [si] != target [i])
+ break;
+ if (si == start + target.Length)
+ return true;
+ }
+
SetOptions (opt);
- return IsSuffix (s, target, start, length);
+ return IsSuffix (s, target, start, length,
+ (opt & CompareOptions.StringSort) != 0);
}
- bool IsSuffix (string s, string target, int start, int length)
+ bool IsSuffix (string s, string t, int start, int length,
+ bool stringSort)
{
- int min = length > target.Length ? target.Length : length;
- int si = start;
-
- // FIXME: this is not enough to handle tailorings.
- for (int j = min - 1; j >= 0; j--, si--) {
- // FIXME: should handle expansions (and it
- // should be before codepoint comparison).
- string expansion = GetExpansion (s [si]);
- if (expansion != null)
- return false;
- expansion = GetExpansion (target [j]);
- if (expansion != null)
- return false;
+ int tstart = 0;
+ for (;tstart < t.Length; tstart++)
+ if (!IsIgnorable (t [tstart]))
+ break;
+ if (tstart == t.Length)
+ return true; // as if target is String.Empty.
- // char-by-char comparison.
- if (IsIgnorable (s [si])) {
- if (!IsIgnorable (target [j]))
- j++;
- continue;
- }
- else if (IsIgnorable (target [j])) {
- si++;
- continue;
- }
- int ci = FilterOptions (s [si]);
- int cj = FilterOptions (target [j]);
- if (ci == cj)
- continue;
- // lv.1 to 3
- if (Category (ci) != Category (cj) ||
- Level1 (ci) != Level1 (cj) ||
- !ignoreNonSpace && Level2 (ci) != Level2 (cj) ||
- Uni.Level3 (ci) != Uni.Level3 (cj))
+#if false
+// This is still not working. If it is not likely to get working, then just remove it.
+ int si = start;
+ int send = start - length;
+ int ti = t.Length - 1;
+ int tend = -1;
+
+ int sStep = start + 1;
+ int tStep = t.Length;
+ bool sourceConsumed, targetConsumed;
+ while (true) {
+ for (; send < si; si--)
+ if (!IsIgnorable (s [si]))
+ break;
+ for (; tend < ti; ti--)
+ if (!IsIgnorable (t [ti]))
+ break;
+ if (tend == ti)
+ break;
+ for (; send < si; si--)
+ if (IsSafe (s [si]))
+ break;
+ for (; tend < ti; ti--)
+ if (IsSafe (t [ti]))
+ break;
+Console.WriteLine ("==== {0} {1} {2} {3} {4} {5} {6} {7} {8}", s, si, send, length, t, ti, tstart, sStep - si, tStep - ti);
+ if (CompareInternal (s, si, sStep - si,
+ t, ti, tStep - ti, stringSort,
+ out targetConsumed, out sourceConsumed,
+ true) != 0)
return false;
- // lv.4 (only when required)
- if (!Uni.HasSpecialWeight ((char) ci))
- continue;
- if (Uni.IsJapaneseSmallLetter ((char) ci) !=
- Uni.IsJapaneseSmallLetter ((char) cj) ||
- Uni.GetJapaneseDashType ((char) ci) !=
- Uni.GetJapaneseDashType ((char) cj) ||
- !Uni.IsHiragana ((char) ci) !=
- !Uni.IsHiragana ((char) cj) ||
- Uni.IsHalfWidthKana ((char) ci) !=
- Uni.IsHalfWidthKana ((char) cj))
+ if (send == si)
return false;
+ sStep = si;
+ tStep = ti;
+ si--;
+ ti--;
}
- if (si == min) {
- // All codepoints in the compared range
- // matches. In that case, what matters
- // is whether the remaining part of
- // "target" is ignorable or not.
- for (int i = target.Length - min - 1; i >= 0; i--)
- if (!IsIgnorable (target [i]))
+ return true;
+#else
+ // FIXME: it is not efficient for very long target.
+ bool sourceConsumed, targetConsumed;
+ int mismatchCount = 0;
+ for (int i = 0; i < length; i++) {
+ escape1.Source = escape2.Source = null;
+ previousSortKey = previousSortKey2 = null;
+ previousChar = previousChar2 = -1;
+
+ int ret = CompareInternal (s, start - i, i + 1,
+ t, tstart, t.Length - tstart,
+ stringSort, out targetConsumed,
+ out sourceConsumed, true);
+ if (ret == 0)
+ return true;
+ if (!sourceConsumed && targetConsumed)
+ return false;
+ if (!targetConsumed) {
+ mismatchCount++;
+ if (mismatchCount > Uni.MaxExpansionLength)
+ // The largest length of an
+ // expansion is 3, so if the
+ // target was not consumed more
+ // than 3 times, then the tail
+ // character does not match.
return false;
- return true;
+ }
}
- return true;
+ return false;
+#endif
}
#endregion
- #region IndexOf()
+ #region IndexOf() / LastIndexOf()
+
+ public int IndexOf (string s, string target, CompareOptions opt)
+ {
+ return IndexOf (s, target, 0, s.Length, opt);
+ }
+
+ public int IndexOf (string s, string target, int start, int length, CompareOptions opt)
+ {
+ SetOptions (opt);
+ return IndexOf (s, target, start, length,
+ (opt & CompareOptions.StringSort) != 0);
+ }
public int IndexOf (string s, char target, CompareOptions opt)
{
return IndexOf (s, target, 0, s.Length, opt);
}
- // To make implementation simple, it does not handle
- // contractions (impossible) expansions. If the character has
- // an expansion form, it creates a string and invokes
- // the overload w/ string argument.
public int IndexOf (string s, char target, int start, int length, CompareOptions opt)
{
SetOptions (opt);
- // If target has an contraction, then use string search.
+ // If target is contraction, then use string search.
Contraction ct = GetContraction (target);
if (ct != null) {
if (ct.Replacement != null)
- return IndexOf (s, ct.Replacement, start, length);
+ return IndexOf (s, ct.Replacement, start, length,
+ (opt & CompareOptions.StringSort) != 0);
else
return IndexOfSortKey (s, start, length, ct.SortKey, char.MinValue, -1, true);
+ } else {
+ int ti = FilterOptions ((int) target);
+ charSortKeyIndexTarget [0] = Category (ti);
+ charSortKeyIndexTarget [1] = Level1 (ti);
+ if (!ignoreNonSpace)
+ charSortKeyIndexTarget [2] =
+ Level2 (ti, ExtenderType.None);
+ charSortKeyIndexTarget [3] = Uni.Level3 (ti);
+ return IndexOfSortKey (s, start, length,
+ charSortKeyIndexTarget, target, ti,
+ !Uni.HasSpecialWeight ((char) ti));
}
- else
- return IndexOfPrimitiveChar (s, start, length, target);
- }
-
- int IndexOfPrimitiveChar (string s, int start, int length, char target)
- {
- int ti = FilterOptions ((int) target);
- charSortKey [0] = Category (ti);
- charSortKey [1] = Level1 (ti);
- if (!ignoreNonSpace)
- charSortKey [2] = Level2 (ti);
- charSortKey [3] = Uni.Level3 (ti);
- return IndexOfSortKey (s, start, length, charSortKey, target, ti, !Uni.HasSpecialWeight ((char) ti));
}
+ // Searches target byte[] keydata
int IndexOfSortKey (string s, int start, int length, byte [] sortkey, char target, int ti, bool noLv4)
{
int end = start + length;
- for (int idx = start; idx < end; idx++) {
- switch (char.GetUnicodeCategory (s [idx])) {
- case UnicodeCategory.PrivateUse:
- case UnicodeCategory.Surrogate:
- if (s [idx] != target)
- continue;
- return idx;
- }
-
- // FIXME: what happens if target is *in the
- // middle of* expansion?
- string expansion = GetExpansion (s [idx]);
- if (expansion != null)
- continue; // since target cannot be expansion as conditioned above.
- if (s [idx] == target)
- return idx;
- int si = FilterOptions ((int) s [idx]);
- if (Category (si) != sortkey [0] ||
- Level1 (si) != sortkey [1] ||
- !ignoreNonSpace && Level2 (si) != sortkey [2] ||
- Uni.Level3 (si) != sortkey [3])
- continue;
- if (noLv4 && !Uni.HasSpecialWeight ((char) si))
- return idx;
- else if (noLv4)
- continue;
- if (Uni.IsJapaneseSmallLetter ((char) si) !=
- Uni.IsJapaneseSmallLetter ((char) ti) ||
- Uni.GetJapaneseDashType ((char) si) !=
- Uni.GetJapaneseDashType ((char) ti) ||
- !Uni.IsHiragana ((char) si) !=
- !Uni.IsHiragana ((char) ti) ||
- Uni.IsHalfWidthKana ((char) si) !=
- Uni.IsHalfWidthKana ((char) ti))
- continue;
- return idx;
+ int idx = start;
+ while (idx < end) {
+ int cur = idx;
+ if (MatchesForward (s, ref idx, end, ti, sortkey, noLv4))
+ return cur;
}
return -1;
}
- public int IndexOf (string s, string target, CompareOptions opt)
+ // Searches string. Search head character (or keydata when
+ // the head is contraction sortkey) and try IsPrefix().
+ int IndexOf (string s, string target, int start, int length, bool stringSort)
{
- return IndexOf (s, target, 0, s.Length, opt);
- }
-
- public int IndexOf (string s, string target, int start, int length, CompareOptions opt)
- {
- SetOptions (opt);
- return IndexOf (s, target, start, length);
- }
-
- int IndexOf (string s, string target, int start, int length)
- {
- Contraction ct = GetContraction (target, 0, target.Length);
- byte [] sortkey = ct != null ? ct.SortKey : null;
+ int tidx = 0;
+ for (; tidx < target.Length; tidx++)
+ if (!IsIgnorable (target [tidx]))
+ break;
+ if (tidx == target.Length)
+ return start;
+ Contraction ct = GetContraction (target, tidx, target.Length - tidx);
string replace = ct != null ? ct.Replacement : null;
+ byte [] sk = replace == null ? charSortKeyIndexTarget : null;
+ bool noLv4 = true;
+ char tc = char.MinValue;
+ int ti = -1;
+ if (ct != null && sk != null) {
+ for (int i = 0; i < ct.SortKey.Length; i++)
+ sk [i] = ct.SortKey [i];
+ } else if (sk != null) {
+ tc = target [tidx];
+ ti = FilterOptions (target [tidx]);
+ sk [0] = Category (ti);
+ sk [1] = Level1 (ti);
+ if (!ignoreNonSpace)
+ sk [2] = Level2 (ti, ExtenderType.None);
+ sk [3] = Uni.Level3 (ti);
+ noLv4 = !Uni.HasSpecialWeight ((char) ti);
+ }
+ if (sk != null) {
+ for (tidx++; tidx < target.Length; tidx++) {
+ if (Category (target [tidx]) != 1)
+ break;
+ if (sk [2] == 0)
+ sk [2] = 2;
+ sk [2] = (byte) (sk [2] + Level2 (target [tidx], ExtenderType.None));
+ }
+ }
+
do {
int idx = 0;
- if (sortkey != null)
- idx = IndexOfSortKey (s, start, length, ct.SortKey, char.MinValue, -1, true);
- else if (replace != null)
- idx = IndexOf (ct.Replacement, target, 0, ct.Replacement.Length);
+ if (replace != null)
+ idx = IndexOf (s, replace, start, length, stringSort);
else
- idx = IndexOfPrimitiveChar (s, start, length, target [0]);
+ idx = IndexOfSortKey (s, start, length, sk, tc, ti, noLv4);
if (idx < 0)
return -1;
- if (IsPrefix (s, target, idx, length - (idx - start)) >= 0)
+ length -= idx - start;
+ start = idx;
+ if (IsPrefix (s, target, start, length, stringSort, false))
return idx;
- start++;
- length--;
+ Contraction cts = GetContraction (s, start, length);
+ if (cts != null) {
+ start += cts.Source.Length;
+ length -= cts.Source.Length;
+ } else {
+ start++;
+ length--;
+ }
} while (length > 0);
return -1;
}
- #endregion
+ //
+ // There are the same number of IndexOf() related methods,
+ // with the same functionalities for each.
+ //
- #region LastIndexOf()
+ public int LastIndexOf (string s, string target, CompareOptions opt)
+ {
+ return LastIndexOf (s, target, s.Length - 1, s.Length, opt);
+ }
+
+ public int LastIndexOf (string s, string target, int start, int length, CompareOptions opt)
+ {
+ SetOptions (opt);
+ return LastIndexOf (s, target, start, length,
+ (opt & CompareOptions.StringSort) != 0);
+ }
public int LastIndexOf (string s, char target, CompareOptions opt)
{
return LastIndexOf (s, target, s.Length - 1, s.Length, opt);
}
- // To make implementation simple, it does not handle
- // contractions (impossible) expansions. If the character has
- // an expansion form, it creates a string and invokes
- // the overload w/ string argument.
public int LastIndexOf (string s, char target, int start, int length, CompareOptions opt)
{
SetOptions (opt);
- // If target has an contraction, then use string search.
+ // If target is contraction, then use string search.
Contraction ct = GetContraction (target);
if (ct != null) {
if (ct.Replacement != null)
- return LastIndexOf (s, ct.Replacement, start, length);
+ return LastIndexOf (s, ct.Replacement, start, length, (opt & CompareOptions.StringSort) != 0);
else
- return LastIndexOfSortKey (s, start, length, ct.SortKey, char.MinValue, -1, true);
+ return LastIndexOfSortKey (s, start, start, length, ct.SortKey, char.MinValue, -1, true);
+ }
+ else {
+ int ti = FilterOptions ((int) target);
+ charSortKeyIndexTarget [0] = Category (ti);
+ charSortKeyIndexTarget [1] = Level1 (ti);
+ if (!ignoreNonSpace)
+ charSortKeyIndexTarget [2] = Level2 (ti, ExtenderType.None);
+ charSortKeyIndexTarget [3] = Uni.Level3 (ti);
+ return LastIndexOfSortKey (s, start, start, length, charSortKeyIndexTarget, target, ti, !Uni.HasSpecialWeight ((char) ti));
}
- else
- return LastIndexOfPrimitiveChar (s, start, length, target);
- }
-
- int LastIndexOfPrimitiveChar (string s, int start, int length, char target)
- {
- int ti = FilterOptions ((int) target);
- charSortKey [0] = Category (ti);
- charSortKey [1] = Level1 (ti);
- if (!ignoreNonSpace)
- charSortKey [2] = Level2 (ti);
- charSortKey [3] = Uni.Level3 (ti);
- return LastIndexOfSortKey (s, start, length, charSortKey, target, ti, !Uni.HasSpecialWeight ((char) ti));
}
- int LastIndexOfSortKey (string s, int start, int length, byte [] sortkey, char target, int ti, bool noLv4)
+ // Searches target byte[] keydata
+ int LastIndexOfSortKey (string s, int start, int orgStart, int length, byte [] sortkey, char target, int ti, bool noLv4)
{
int end = start - length;
-
- for (int idx = start; idx > end; idx--) {
- switch (char.GetUnicodeCategory (s [idx])) {
- case UnicodeCategory.PrivateUse:
- case UnicodeCategory.Surrogate:
- if (s [idx] != target)
- continue;
- return idx;
- }
-
- // FIXME: what happens if target is *in the
- // middle of* expansion?
- string expansion = GetExpansion (s [idx]);
- if (expansion != null)
- continue; // since target cannot be expansion as conditioned above.
- if (s [idx] == target)
- return idx;
- int si = FilterOptions ((int) s [idx]);
- if (Category (si) != sortkey [0] ||
- Level1 (si) != sortkey [1] ||
- !ignoreNonSpace && Level2 (si) != sortkey [2] ||
- Uni.Level3 (si) != sortkey [3])
- continue;
- if (noLv4 && !Uni.HasSpecialWeight ((char) si))
- return idx;
- else if (noLv4)
- continue;
- if (Uni.IsJapaneseSmallLetter ((char) si) !=
- Uni.IsJapaneseSmallLetter ((char) ti) ||
- Uni.GetJapaneseDashType ((char) si) !=
- Uni.GetJapaneseDashType ((char) ti) ||
- !Uni.IsHiragana ((char) si) !=
- !Uni.IsHiragana ((char) ti) ||
- Uni.IsHalfWidthKana ((char) si) !=
- Uni.IsHalfWidthKana ((char) ti))
- continue;
- return idx;
+ int idx = start;
+ while (idx > end) {
+ int cur = idx;
+ if (MatchesBackward (s, ref idx, end, orgStart, ti, sortkey, noLv4))
+ return cur;
}
return -1;
}
- public int LastIndexOf (string s, string target, CompareOptions opt)
- {
- return LastIndexOf (s, target, s.Length - 1, s.Length, opt);
- }
-
- public int LastIndexOf (string s, string target, int start, int length, CompareOptions opt)
- {
- SetOptions (opt);
- return LastIndexOf (s, target, start, length);
- }
-
- int LastIndexOf (string s, string target, int start, int length)
+ // Searches string. Search head character (or keydata when
+ // the head is contraction sortkey) and try IsPrefix().
+ int LastIndexOf (string s, string target, int start, int length, bool stringSort)
{
int orgStart = start;
+ int tidx = 0;
+ for (; tidx < target.Length; tidx++)
+ if (!IsIgnorable (target [tidx]))
+ break;
+ if (tidx == target.Length)
+ return start;
+ Contraction ct = GetContraction (target, tidx, target.Length - tidx);
+ string replace = ct != null ? ct.Replacement : null;
+ byte [] sk = replace == null ? charSortKeyIndexTarget : null;
+
+ bool noLv4 = true;
+ char tc = char.MinValue;
+ int ti = -1;
+ if (ct != null && sk != null) {
+ for (int i = 0; i < ct.SortKey.Length; i++)
+ sk [i] = ct.SortKey [i];
+ } else if (sk != null) {
+ tc = target [tidx];
+ ti = FilterOptions (target [tidx]);
+ sk [0] = Category (ti);
+ sk [1] = Level1 (ti);
+ if (!ignoreNonSpace)
+ sk [2] = Level2 (ti, ExtenderType.None);
+ sk [3] = Uni.Level3 (ti);
+ noLv4 = !Uni.HasSpecialWeight ((char) ti);
+ }
+ if (sk != null) {
+ for (tidx++; tidx < target.Length; tidx++) {
+ if (Category (target [tidx]) != 1)
+ break;
+ if (sk [2] == 0)
+ sk [2] = 2;
+ sk [2] = (byte) (sk [2] + Level2 (target [tidx], ExtenderType.None));
+ }
+ }
do {
int idx = 0;
- Contraction ct = GetContraction (s, start, length);
- if (ct != null) {
- if (ct.SortKey != null)
- idx = LastIndexOfSortKey (s, start, length, ct.SortKey, char.MinValue, -1, true);
- else
- idx = LastIndexOf (ct.Replacement, target, ct.Replacement.Length - 1, ct.Replacement.Length);
- }
- else
- idx = LastIndexOfPrimitiveChar (s, start, length, target [0]);
+ if (replace != null)
+ idx = LastIndexOf (s, replace, start, length, stringSort);
+ else
+ idx = LastIndexOfSortKey (s, start, orgStart, length, sk, tc, ti, noLv4);
if (idx < 0)
return -1;
- if (IsPrefix (s, target, idx, orgStart - idx + 1) >= 0)
+ length -= start - idx;
+ start = idx;
+
+ if (IsPrefix (s, target, idx, orgStart - idx + 1, stringSort, false)) {
+ for (;idx < orgStart; idx++)
+ if (!IsIgnorable (s [idx]))
+ break;
return idx;
- length--;
- start--;
+ }
+ Contraction cts = GetContraction (s, idx, orgStart - idx + 1);
+ if (cts != null) {
+ start -= cts.Source.Length;
+ length -= cts.Source.Length;
+ } else {
+ start--;
+ length--;
+ }
} while (length > 0);
return -1;
}
+ private bool MatchesForward (string s, ref int idx, int end, int ti, byte [] sortkey, bool noLv4)
+ {
+ int si = -1;
+ ExtenderType ext = GetExtenderType (s [idx]);
+ Contraction ct = null;
+ if (ext == ExtenderType.None)
+ ct = GetContraction (s, idx, end);
+ else if (previousChar < 0) {
+ if (previousSortKey == null) {
+ idx++;
+ return false;
+ }
+ charSortKey = previousSortKey;
+ }
+ else
+ si = FilterExtender (previousChar, ext);
+ // if lv4 exists, it never matches contraction
+ if (ct != null) {
+ idx += ct.Source.Length;
+ if (!noLv4)
+ return false;
+ if (ct.SortKey != null) {
+ for (int i = 0; i < sortkey.Length; i++)
+ charSortKey [i] = sortkey [i];
+ previousChar = -1;
+ previousSortKey = charSortKey;
+ } else {
+ // Here is the core of LAMESPEC
+ // described at the top of the source.
+ int dummy = 0;
+ return MatchesForward (ct.Replacement, ref dummy,
+ ct.Replacement.Length, ti, sortkey, noLv4);
+ }
+ } else {
+ if (si < 0)
+ si = FilterOptions (s [idx]);
+ charSortKey [0] = Category (si);
+ charSortKey [1] = Level1 (si);
+ if (!ignoreNonSpace)
+ charSortKey [2] = Level2 (si, ext);
+ charSortKey [3] = Uni.Level3 (si);
+ if (charSortKey [0] != 1)
+ previousChar = si;
+ idx++;
+ }
+ for (; idx < end; idx++) {
+ if (Category (s [idx]) != 1)
+ break;
+ if (ignoreNonSpace)
+ continue;
+ if (charSortKey [2] == 0)
+ charSortKey [2] = 2;
+ charSortKey [2] = (byte) (charSortKey [2]
+ + Level2 (s [idx], ExtenderType.None));
+ }
+
+ return MatchesPrimitive (charSortKey, si, ext, sortkey, ti, noLv4);
+ }
+
+ private bool MatchesPrimitive (byte [] charSortKey, int si, ExtenderType ext, byte [] sortkey, int ti, bool noLv4)
+ {
+ if (charSortKey [0] != sortkey [0] ||
+ charSortKey [1] != sortkey [1] ||
+ (!ignoreNonSpace && charSortKey [2] != sortkey [2]) ||
+ charSortKey [3] != sortkey [3])
+ return false;
+ if (noLv4 && (si < 0 || !Uni.HasSpecialWeight ((char) si)))
+ return true;
+ else if (noLv4)
+ return false;
+ // Since target can never be an extender, if the source
+ // is an expander and it matters, then they never match.
+ if (!ignoreNonSpace && ext == ExtenderType.Conditional)
+ return false;
+ if (Uni.IsJapaneseSmallLetter ((char) si) !=
+ Uni.IsJapaneseSmallLetter ((char) ti) ||
+ ToDashTypeValue (ext) !=
+ // FIXME: we will have to specify correct value for target
+ ToDashTypeValue (ExtenderType.None) ||
+ !Uni.IsHiragana ((char) si) !=
+ !Uni.IsHiragana ((char) ti) ||
+ IsHalfKana ((char) si) !=
+ IsHalfKana ((char) ti))
+ return false;
+ return true;
+ }
+
+ private bool MatchesBackward (string s, ref int idx, int end, int orgStart, int ti, byte [] sortkey, bool noLv4)
+ {
+ int cur = idx;
+ int si = -1;
+ ExtenderType ext = GetExtenderType (s [idx]);
+ // To handle extenders in source, we need to
+ // check next _primary_ character.
+ if (ext != ExtenderType.None) {
+ byte diacritical = 0;
+ for (int tmp = 0; ; tmp--) {
+ if (tmp < 0) // heading extender
+ return false;
+ if (IsIgnorable (s [tmp]))
+ continue;
+ int tmpi = FilterOptions (s [tmp]);
+ byte category = Category (tmpi);
+ if (category == 1) {
+ diacritical = Level2 (tmpi, ExtenderType.None);
+ continue;
+ }
+ si = FilterExtender (tmpi, ext);
+ charSortKey [0] = category;
+ charSortKey [1] = Level1 (si);
+ if (!ignoreNonSpace)
+ charSortKey [2] = Level2 (si, ext);
+ charSortKey [3] = Uni.Level3 (si);
+ if (ext != ExtenderType.Conditional &&
+ diacritical != 0)
+ charSortKey [2] =
+ (charSortKey [2] == 0) ?
+ (byte) (diacritical + 2) :
+ diacritical;
+ break;
+ }
+ idx--;
+ }
+ Contraction ct = null;
+ if (ext == ExtenderType.None)
+ ct = GetContraction (s, idx, end);
+ // if lv4 exists, it never matches contraction
+ if (ct != null) {
+ idx -= ct.Source.Length;
+ if (!noLv4)
+ return false;
+ if (ct.SortKey != null) {
+ for (int i = 0; i < sortkey.Length; i++)
+ charSortKey [i] = sortkey [i];
+ previousChar = -1;
+ previousSortKey = charSortKey;
+ } else {
+ // Here is the core of LAMESPEC
+ // described at the top of the source.
+ int dummy = ct.Replacement.Length - 1;
+ return MatchesBackward (ct.Replacement,
+ ref dummy, dummy, -1, ti, sortkey, noLv4);
+ }
+ } else if (ext == ExtenderType.None) {
+ if (si < 0)
+ si = FilterOptions (s [idx]);
+ charSortKey [0] = Category (si);
+ charSortKey [1] = Level1 (si);
+ if (!ignoreNonSpace)
+ charSortKey [2] = Level2 (si, ext);
+ charSortKey [3] = Uni.Level3 (si);
+ if (charSortKey [0] != 1)
+ previousChar = si;
+ idx--;
+ }
+ if (ext == ExtenderType.None) {
+ for (int tmp = cur + 1; tmp < orgStart; tmp++) {
+ if (Category (s [tmp]) != 1)
+ break;
+ if (ignoreNonSpace)
+ continue;
+ if (charSortKey [2] == 0)
+ charSortKey [2] = 2;
+ charSortKey [2] = (byte) (charSortKey [2]
+ + Level2 (s [tmp], ExtenderType.None));
+ }
+ }
+ return MatchesPrimitive (charSortKey, si, ext, sortkey, ti, noLv4);
+ }
#endregion
}
}