X-Git-Url: http://wien.tomnetworks.com/gitweb/?a=blobdiff_plain;f=mcs%2Fclass%2Fcorlib%2FMono.Globalization.Unicode%2FSimpleCollator.cs;h=cbe4d904eb17cdb8836bd2eb0ef914f839aff62b;hb=623803f165f9917497606d5e514609a71963fffa;hp=8c3c96ca8c7f0d4807049abadd876d143ab9ccbb;hpb=ae90babecc203a5b8e84af4bace27da1b6f40137;p=mono.git diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/SimpleCollator.cs b/mcs/class/corlib/Mono.Globalization.Unicode/SimpleCollator.cs index 8c3c96ca8c7..cbe4d904eb1 100644 --- a/mcs/class/corlib/Mono.Globalization.Unicode/SimpleCollator.cs +++ b/mcs/class/corlib/Mono.Globalization.Unicode/SimpleCollator.cs @@ -23,10 +23,6 @@ // other character than the identical sequence (depending on CompareOptions // and of course, defined sortkey value itself), comparison cannot be done // at source char[] level. -// -// (to be continued.) -// - // // In IndexOf(), the first character in the target string or the target char // itself is turned into sortkey bytes. If the character has a contraction and @@ -37,7 +33,6 @@ // tries IsPrefix() from that location. If it returns true, then it returns // the index. // - // LAMESPEC: IndexOf() is lame as a whole API. It never matches in the middle // of expansion and there is no proper way to return such indexes within // a single int return value. @@ -62,30 +57,6 @@ namespace Mono.Globalization.Unicode static SimpleCollator invariant = new SimpleCollator (CultureInfo.InvariantCulture); - internal static readonly byte [] ignorableFlags = - Uni.ignorableFlags; - internal static readonly byte [] categories = - Uni.categories; - internal static readonly byte [] level1 = - Uni.level1; - internal static readonly byte [] level2 = - Uni.level2; - internal static readonly byte [] level3 = - Uni.level3; - internal static readonly ushort [] widthCompat = - Uni.widthCompat; - internal static readonly CodePointIndexer categoryIndexer = - UUtil.Category; - internal static readonly CodePointIndexer lv1Indexer = - UUtil.Level1; - internal static readonly CodePointIndexer lv2Indexer = - UUtil.Level2; - internal static readonly CodePointIndexer lv3Indexer = - UUtil.Level3; - internal static readonly CodePointIndexer widthIndexer = - UUtil.WidthCompat; - - SortKeyBuffer buf; // CompareOptions expanded. bool ignoreNonSpace; // used in IndexOf() @@ -95,92 +66,25 @@ namespace Mono.Globalization.Unicode bool ignoreKanaType; TextInfo textInfo; // for ToLower(). bool frenchSort; - readonly ushort [] cjkTable; + unsafe readonly byte* cjkCatTable; + unsafe readonly byte* cjkLv1Table; readonly CodePointIndexer cjkIndexer; - readonly byte [] cjkLv2Table; + unsafe readonly byte* cjkLv2Table; readonly CodePointIndexer cjkLv2Indexer; readonly int lcid; readonly Contraction [] contractions; readonly Level2Map [] level2Maps; + // This flag marks characters as "unsafe", where the character + // could be used as part of a contraction (whose length > 1). + readonly bool [] unsafeFlags; + + const int UnsafeFlagLength = 0x300; + // temporary sortkey buffer for index search/comparison byte [] charSortKey = new byte [4]; byte [] charSortKey2 = new byte [4]; - // temporary expansion store for IsPrefix/Suffix - int escapedSourceIndex; - - #region Tailoring support classes - // Possible mapping types are: - // - // - string to string (ReplacementMap) - // - string to SortKey (SortKeyMap) - // - diacritical byte to byte (DiacriticalMap) - // - // There could be mapping from string to sortkeys, but - // for now there is none as such. - // - internal class Contraction - { - public readonly char [] Source; - // only either of them is used. - public readonly string Replacement; - public readonly byte [] SortKey; - - public Contraction (char [] source, - string replacement, byte [] sortkey) - { - Source = source; - Replacement = replacement; - SortKey = sortkey; - } - } - - internal class ContractionComparer : IComparer - { - public static readonly ContractionComparer Instance = - new ContractionComparer (); - - public int Compare (object o1, object o2) - { - Contraction c1 = (Contraction) o1; - Contraction c2 = (Contraction) o2; - char [] a1 = c1.Source; - char [] a2 = c2.Source; - int min = a1.Length > a2.Length ? - a2.Length : a1.Length; - for (int i = 0; i < min; i++) - if (a1 [i] != a2 [i]) - return a1 [i] - a2 [i]; - return a1.Length - a2.Length; - } - } - - internal class Level2Map - { - public byte Source; - public byte Replace; - - public Level2Map (byte source, byte replace) - { - Source = source; - Replace = replace; - } - } - - internal class Level2MapComparer : IComparer - { - public static readonly Level2MapComparer Instance = - new Level2MapComparer (); - - public int Compare (object o1, object o2) - { - Level2Map m1 = (Level2Map) o1; - Level2Map m2 = (Level2Map) o2; - return (m1.Source - m2.Source); - } - } - - #endregion + byte [] charSortKeyIndexTarget = new byte [4]; #region .ctor() and split functions @@ -190,8 +94,11 @@ namespace Mono.Globalization.Unicode textInfo = culture.TextInfo; buf = new SortKeyBuffer (culture.LCID); - SetCJKTable (culture, ref cjkTable, ref cjkIndexer, - ref cjkLv2Table, ref cjkLv2Indexer); + unsafe { + SetCJKTable (culture, ref cjkIndexer, + ref cjkCatTable, ref cjkLv1Table, + ref cjkLv2Indexer, ref cjkLv2Table); + } // Get tailoring info TailoringInfo t = null; @@ -204,8 +111,14 @@ namespace Mono.Globalization.Unicode t = Uni.GetTailoringInfo (127); frenchSort = t.FrenchSort; - BuildTailoringTables (culture, t, ref contractions, + Uni.BuildTailoringTables (culture, t, ref contractions, ref level2Maps); + unsafeFlags = new bool [UnsafeFlagLength]; + foreach (Contraction c in contractions) + if (c.Source.Length > 1) + foreach (char ch in c.Source) + unsafeFlags [(int) ch] = true; + // FIXME: Since tailorings are mostly for latin // (and in some cases Cyrillic) characters, it would // be much better for performance to store "start @@ -223,97 +136,15 @@ Console.WriteLine (" -> '{0}'", c.Replacement); */ } - private void BuildTailoringTables (CultureInfo culture, - TailoringInfo t, - ref Contraction [] contractions, - ref Level2Map [] diacriticals) - { - // collect tailoring entries. - ArrayList cmaps = new ArrayList (); - ArrayList dmaps = new ArrayList (); - char [] tarr = Uni.TailoringValues; - int idx = t.TailoringIndex; - int end = idx + t.TailoringCount; - while (idx < end) { - int ss = idx + 1; - char [] src = null; - switch (tarr [idx]) { - case '\x1': // SortKeyMap - idx++; - while (tarr [ss] != 0) - ss++; - src = new char [ss - idx]; - Array.Copy (tarr, idx, src, 0, ss - idx); - byte [] sortkey = new byte [4]; - for (int i = 0; i < 4; i++) - sortkey [i] = (byte) tarr [ss + 1 + i]; - cmaps.Add (new Contraction ( - src, null, sortkey)); - // it ends with 0 - idx = ss + 6; - break; - case '\x2': // DiacriticalMap - dmaps.Add (new Level2Map ( - (byte) tarr [idx + 1], - (byte) tarr [idx + 2])); - idx += 3; - break; - case '\x3': // ReplacementMap - idx++; - while (tarr [ss] != 0) - ss++; - src = new char [ss - idx]; - Array.Copy (tarr, idx, src, 0, ss - idx); - ss++; - int l = ss; - while (tarr [l] != 0) - l++; - string r = new string (tarr, ss, l - ss); - cmaps.Add (new Contraction ( - src, r, null)); - idx = l + 1; - break; - default: - throw new NotImplementedException (String.Format ("Mono INTERNAL ERROR (Should not happen): Collation tailoring table is broken for culture {0} ({1}) at 0x{2:X}", culture.LCID, culture.Name, idx)); - } - } - cmaps.Sort (ContractionComparer.Instance); - dmaps.Sort (Level2MapComparer.Instance); - contractions = cmaps.ToArray (typeof (Contraction)) - as Contraction []; - diacriticals = dmaps.ToArray (typeof (Level2Map)) - as Level2Map []; - } - - private void SetCJKTable (CultureInfo culture, - ref ushort [] cjkTable, ref CodePointIndexer cjkIndexer, - ref byte [] cjkLv2Table, ref CodePointIndexer cjkLv2Indexer) + unsafe private void SetCJKTable ( + CultureInfo culture, ref CodePointIndexer cjkIndexer, + ref byte* catTable, ref byte* lv1Table, + ref CodePointIndexer lv2Indexer, ref byte* lv2Table) { string name = GetNeutralCulture (culture).Name; - Uni.FillCJK (name); - - // custom CJK table support. - switch (name) { - case "zh-CHS": - cjkTable = Uni.CjkCHS; - cjkIndexer = UUtil.CjkCHS; - break; - case "zh-CHT": - cjkTable = Uni.CjkCHT; - cjkIndexer = UUtil.Cjk; - break; - case "ja": - cjkTable = Uni.CjkJA; - cjkIndexer = UUtil.Cjk; - break; - case "ko": - cjkTable = Uni.CjkKO; - cjkLv2Table = Uni.CjkKOLv2; - cjkIndexer = UUtil.Cjk; - cjkLv2Indexer = UUtil.Cjk; - break; - } + Uni.FillCJK (name, ref cjkIndexer, ref catTable, + ref lv1Table, ref lv2Indexer, ref lv2Table); } static CultureInfo GetNeutralCulture (CultureInfo info) @@ -326,35 +157,36 @@ Console.WriteLine (" -> '{0}'", c.Replacement); #endregion - byte Category (int cp) + unsafe byte Category (int cp) { - if (cp < 0x3000 || cjkTable == null) - return categories [categoryIndexer.ToIndex (cp)]; + if (cp < 0x3000 || cjkCatTable == null) + return Uni.Category (cp); int idx = cjkIndexer.ToIndex (cp); - ushort cjk = idx < 0 ? (ushort) 0 : cjkTable [idx]; - return cjk != 0 ? (byte) ((cjk & 0xFF00) >> 8) : - categories [categoryIndexer.ToIndex (cp)]; + return idx < 0 ? Uni.Category (cp) : cjkCatTable [idx]; } - byte Level1 (int cp) + unsafe byte Level1 (int cp) { - if (cp < 0x3000 || cjkTable == null) - return level1 [lv1Indexer.ToIndex (cp)]; + if (cp < 0x3000 || cjkLv1Table == null) + return Uni.Level1 (cp); int idx = cjkIndexer.ToIndex (cp); - ushort cjk = idx < 0 ? (ushort) 0 : cjkTable [idx]; - return cjk != 0 ? (byte) (cjk & 0xFF) : - level1 [lv1Indexer.ToIndex (cp)]; + return idx < 0 ? Uni.Level1 (cp) : cjkLv1Table [idx]; } - byte Level2 (int cp) + unsafe byte Level2 (int cp, ExtenderType ext) { + if (ext == ExtenderType.Buggy) + return 5; + else if (ext == ExtenderType.Conditional) + return 0; + if (cp < 0x3000 || cjkLv2Table == null) - return level2 [lv2Indexer.ToIndex (cp)]; + return Uni.Level2 (cp); int idx = cjkLv2Indexer.ToIndex (cp); byte ret = idx < 0 ? (byte) 0 : cjkLv2Table [idx]; if (ret != 0) return ret; - ret = level2 [lv2Indexer.ToIndex (cp)]; + ret = Uni.Level2 (cp); if (level2Maps.Length == 0) return ret; for (int i = 0; i < level2Maps.Length; i++) { @@ -378,6 +210,9 @@ Console.WriteLine (" -> '{0}'", c.Replacement); this.ignoreWidth = (options & CompareOptions.IgnoreWidth) != 0; this.ignoreCase = (options & CompareOptions.IgnoreCase) != 0; this.ignoreKanaType = (options & CompareOptions.IgnoreKanaType) != 0; + previousChar = previousChar2 = -1; + previousSortKey = previousSortKey2 = null; + escape1.Source = escape2.Source = null; } Contraction GetContraction (string s, int start, int end) @@ -468,7 +303,7 @@ Console.WriteLine (" -> '{0}'", c.Replacement); int FilterOptions (int i) { if (ignoreWidth) { - int x = widthCompat [widthIndexer.ToIndex (i)]; + int x = Uni.ToWidthCompat (i); if (x != 0) i = x; } @@ -479,31 +314,10 @@ Console.WriteLine (" -> '{0}'", c.Replacement); return i; } - #region GetSortKey() - - public SortKey GetSortKey (string s) - { - return GetSortKey (s, CompareOptions.None); - } - - public SortKey GetSortKey (string s, CompareOptions options) - { - return GetSortKey (s, 0, s.Length, options); - } - - public SortKey GetSortKey (string s, int start, int length, CompareOptions options) - { - SetOptions (options); - - buf.Initialize (options, lcid, s, frenchSort); - int end = start + length; - previousChar = -1; - GetSortKey (s, start, end); - return buf.GetResultAndReset (); - } - int previousChar = -1; byte [] previousSortKey = null; + int previousChar2 = -1; + byte [] previousSortKey2 = null; enum ExtenderType { None, @@ -521,12 +335,25 @@ Console.WriteLine (" -> '{0}'", c.Replacement); // Windows also expects true for U+3031 and U+3032, // but they should *never* repeat one character. + // U+2015 becomes an extender only when it is Japanese + if (i == 0x2015) + return lcid == 16 ? ExtenderType.Conditional : + ExtenderType.None; + if (i < 0x3005 || i > 0xFF70) return ExtenderType.None; - if (i == 0xFE7C || i == 0xFE7D) - return ExtenderType.Simple; - if (i == 0xFF70) - return ExtenderType.Conditional; + if (i >= 0xFE7C) { + switch (i) { + case 0xFE7C: + case 0xFE7D: + return ExtenderType.Simple; + case 0xFF70: + return ExtenderType.Conditional; + case 0xFF9E: + case 0xFF9F: + return ExtenderType.Voiced; + } + } if (i > 0x30FE) return ExtenderType.None; switch (i) { @@ -561,6 +388,28 @@ Console.WriteLine (" -> '{0}'", c.Replacement); } } + int FilterExtender (int i, ExtenderType ext) + { + if (ext == ExtenderType.Conditional && + Uni.HasSpecialWeight ((char) i)) { + bool half = IsHalfKana ((char) i); + bool katakana = !Uni.IsHiragana ((char) i); + switch (Level1 (i) & 7) { + case 2: + return half ? 0xFF71 : katakana ? 0x30A2 : 0x3042; + case 3: + return half ? 0xFF72 : katakana ? 0x30A4 : 0x3044; + case 4: + return half ? 0xFF73 : katakana ? 0x30A6 : 0x3046; + case 5: + return half ? 0xFF74 : katakana ? 0x30A8 : 0x3048; + case 6: + return half ? 0xFF75 : katakana ? 0x30AA : 0x304A; + } + } + return i; + } + bool IsIgnorable (int i) { return Uni.IsIgnorable (i) || @@ -568,6 +417,33 @@ Console.WriteLine (" -> '{0}'", c.Replacement); ignoreNonSpace && Uni.IsIgnorableNonSpacing (i); } + bool IsSafe (int i) + { + return i >= unsafeFlags.Length ? true : !unsafeFlags [i]; + } + + #region GetSortKey() + + public SortKey GetSortKey (string s) + { + return GetSortKey (s, CompareOptions.None); + } + + public SortKey GetSortKey (string s, CompareOptions options) + { + return GetSortKey (s, 0, s.Length, options); + } + + public SortKey GetSortKey (string s, int start, int length, CompareOptions options) + { + SetOptions (options); + + buf.Initialize (options, lcid, s, frenchSort); + int end = start + length; + GetSortKey (s, start, end); + return buf.GetResultAndReset (); + } + void GetSortKey (string s, int start, int end) { for (int n = start; n < end; n++) { @@ -575,7 +451,7 @@ Console.WriteLine (" -> '{0}'", c.Replacement); ExtenderType ext = GetExtenderType (i); if (ext != ExtenderType.None) { - i = previousChar; + i = FilterExtender (previousChar, ext); if (i >= 0) FillSortKeyRaw (i, ext); else if (previousSortKey != null) { @@ -583,8 +459,8 @@ Console.WriteLine (" -> '{0}'", c.Replacement); buf.AppendNormal ( b [0], b [1], - b [2] != 1 ? b [2] : Level2 (i), - b [3] != 1 ? b [3] : level3 [lv3Indexer.ToIndex (i)]); + b [2] != 1 ? b [2] : Level2 (i, ext), + b [3] != 1 ? b [3] : Uni.Level3 (i)); } // otherwise do nothing. // (if the extender is the first char @@ -605,15 +481,16 @@ Console.WriteLine (" -> '{0}'", c.Replacement); buf.AppendNormal ( b [0], b [1], - b [2] != 1 ? b [2] : Level2 (i), - b [3] != 1 ? b [3] : level3 [lv3Indexer.ToIndex (i)]); + b [2] != 1 ? b [2] : Level2 (i, ext), + b [3] != 1 ? b [3] : Uni.Level3 (i)); previousSortKey = b; previousChar = -1; } n += ct.Source.Length - 1; } else { - previousChar = i; + if (!Uni.IsIgnorableNonSpacing (i)) + previousChar = i; FillSortKeyRaw (i, ExtenderType.None); } } @@ -644,13 +521,9 @@ Console.WriteLine (" -> '{0}'", c.Replacement); return; } - byte level2 = Level2 (i); - if (ext == ExtenderType.Buggy) - level2 = 5; + byte level2 = Level2 (i, ext); if (Uni.HasSpecialWeight ((char) i)) { byte level1 = Level1 (i); - if (ext == ExtenderType.Conditional) - level1 = (byte) ((level1 & 0xF) % 8); buf.AppendKana ( Category (i), level1, @@ -725,6 +598,7 @@ Console.WriteLine (" -> '{0}'", c.Replacement); public int Index; public int Start; public int End; + public int Optional; } // Those instances are reused not to invoke instantiation @@ -773,20 +647,23 @@ Console.WriteLine (" -> '{0}'", c.Replacement); return d1.Length == d2.Length ? 0 : d1.Length < d2.Length ? -1 : 1; #else SetOptions (options); - escape1.Source = null; - escape2.Source = null; - int ret = Compare (s1, idx1, len1, s2, idx2, len2, (options & CompareOptions.StringSort) != 0); + bool dummy, dummy2; + int ret = CompareInternal (s1, idx1, len1, s2, idx2, len2, (options & CompareOptions.StringSort) != 0, out dummy, out dummy2, true); return ret == 0 ? 0 : ret < 0 ? -1 : 1; #endif } - int Compare (string s1, int idx1, int len1, string s2, - int idx2, int len2, bool stringSort) + int CompareInternal (string s1, int idx1, int len1, string s2, + int idx2, int len2, bool stringSort, + out bool targetConsumed, out bool sourceConsumed, + bool skipHeadingExtenders) { int start1 = idx1; int start2 = idx2; int end1 = idx1 + len1; int end2 = idx2 + len2; + targetConsumed = false; + sourceConsumed = false; // It holds final result that comes from the comparison // at level 2 or lower. Even if Compare() found the @@ -803,6 +680,22 @@ Console.WriteLine (" -> '{0}'", c.Replacement); int lv5Value1 = 0; int lv5Value2 = 0; + // Skip heading extenders + if (skipHeadingExtenders) { + for (; idx1 < end1; idx1++) + if (GetExtenderType (s1 [idx1]) == ExtenderType.None) + break; + for (; idx2 < end2; idx2++) + if (GetExtenderType (s2 [idx2]) == ExtenderType.None) + break; + } + + ExtenderType ext1 = ExtenderType.None; + ExtenderType ext2 = ExtenderType.None; + + int quickCheckPos1 = idx1; + int quickCheckPos2 = idx2; + while (true) { for (; idx1 < end1; idx1++) if (!IsIgnorable (s1 [idx1])) @@ -818,7 +711,9 @@ Console.WriteLine (" -> '{0}'", c.Replacement); start1 = escape1.Start; idx1 = escape1.Index; end1 = escape1.End; + quickCheckPos1 = escape1.Optional; escape1.Source = null; + continue; } if (idx2 >= end2) { if (escape2.Source == null) @@ -827,20 +722,48 @@ Console.WriteLine (" -> '{0}'", c.Replacement); start2 = escape2.Start; idx2 = escape2.Index; end2 = escape2.End; + quickCheckPos2 = escape2.Optional; escape2.Source = null; + continue; } -#if false -// FIXME: optimization could be done here. +#if true +// If comparison is unstable, then this part is one of the most doubtful part. +// Here we run quick codepoint comparison and run back to "stable" area to +// compare characters. + + // Strictly to say, even the first character + // could be compared here, but it messed + // backward step, so I just avoided mess. + if (quickCheckPos1 < idx1 && quickCheckPos2 < idx2) { + while (idx1 < end1 && idx2 < end2 && + s1 [idx1] == s2 [idx2]) { + idx1++; + idx2++; + } + if (idx1 == end1 || idx2 == end2) + continue; // check replacement - if (s1 [idx1] == s2 [idx2]) { - idx1++; - idx2++; - continue; + int backwardEnd1 = quickCheckPos1; + int backwardEnd2 = quickCheckPos2; + quickCheckPos1 = idx1; + quickCheckPos2 = idx2; + + idx1--; + idx2--; + + for (;idx1 > backwardEnd1; idx1--) + if (Category (s1 [idx1]) != 1) + break; + for (;idx2 > backwardEnd2; idx2--) + if (Category (s2 [idx2]) != 1) + break; + for (;idx1 > backwardEnd1; idx1--) + if (IsSafe (s1 [idx1])) + break; + for (;idx2 > backwardEnd2; idx2--) + if (IsSafe (s2 [idx2])) + break; } -// while (idx1 >= start1 && !IsSafe ((int) s [idx1])) -// idx1--; -// while (idx2 >= start2 && !IsSafe ((int) s [idx2])) -// idx2--; #endif int cur1 = idx1; @@ -851,6 +774,36 @@ Console.WriteLine (" -> '{0}'", c.Replacement); int i2 = FilterOptions (s2 [idx2]); bool special1 = false; bool special2 = false; + + // If current character is an expander, then + // repeat the previous character. + ext1 = GetExtenderType (i1); + if (ext1 != ExtenderType.None) { + if (previousChar < 0) { + if (previousSortKey == null) { + // nothing to extend + idx1++; + continue; + } + sk1 = previousSortKey; + } + else + i1 = FilterExtender (previousChar, ext1); + } + ext2 = GetExtenderType (i2); + if (ext2 != ExtenderType.None) { + if (previousChar2 < 0) { + if (previousSortKey2 == null) { + // nothing to extend + idx2++; + continue; + } + sk2 = previousSortKey2; + } + else + i2 = FilterExtender (previousChar2, ext2); + } + byte cat1 = Category (i1); byte cat2 = Category (i2); @@ -863,6 +816,7 @@ Console.WriteLine (" -> '{0}'", c.Replacement); // here Windows has a bug that it does // not consider thirtiary weight. lv5Value1 = Level1 (i1) << 8 + Uni.Level3 (i1); + previousChar = i1; idx1++; } if (cat2 == 6) { @@ -872,6 +826,7 @@ Console.WriteLine (" -> '{0}'", c.Replacement); // here Windows has a bug that it does // not consider thirtiary weight. lv5Value2 = Level1 (i2) << 8 + Uni.Level3 (i2); + previousChar2 = i2; idx2++; } if (cat1 == 6 || cat2 == 6) { @@ -880,24 +835,33 @@ Console.WriteLine (" -> '{0}'", c.Replacement); } } - Contraction ct1 = GetContraction (s1, idx1, end1); + Contraction ct1 = null; + if (ext1 == ExtenderType.None) + ct1 = GetContraction (s1, idx1, end1); + int offset1 = 1; - if (ct1 != null) { + if (sk1 != null) + offset1 = 1; + else if (ct1 != null) { offset1 = ct1.Source.Length; if (ct1.SortKey != null) { sk1 = charSortKey; for (int i = 0; i < ct1.SortKey.Length; i++) sk1 [i] = ct1.SortKey [i]; + previousChar = -1; + previousSortKey = sk1; } else if (escape1.Source == null) { escape1.Source = s1; escape1.Start = start1; escape1.Index = cur1 + ct1.Source.Length; escape1.End = end1; + escape1.Optional = quickCheckPos1; s1 = ct1.Replacement; idx1 = 0; start1 = 0; end1 = s1.Length; + quickCheckPos1 = 0; continue; } } @@ -906,30 +870,41 @@ Console.WriteLine (" -> '{0}'", c.Replacement); sk1 [0] = cat1; sk1 [1] = Level1 (i1); if (!ignoreNonSpace && currentLevel > 1) - sk1 [2] = Level2 (i1); + sk1 [2] = Level2 (i1, ext1); if (currentLevel > 2) sk1 [3] = Uni.Level3 (i1); if (currentLevel > 3) special1 = Uni.HasSpecialWeight ((char) i1); + if (cat1 > 1) + previousChar = i1; } - Contraction ct2 = GetContraction (s2, idx2, end2); - if (ct2 != null) { + Contraction ct2 = null; + if (ext2 == ExtenderType.None) + ct2 = GetContraction (s2, idx2, end2); + + if (sk2 != null) + idx2++; + else if (ct2 != null) { idx2 += ct2.Source.Length; if (ct2.SortKey != null) { sk2 = charSortKey2; for (int i = 0; i < ct2.SortKey.Length; i++) sk2 [i] = ct2.SortKey [i]; + previousChar2 = -1; + previousSortKey2 = sk2; } else if (escape2.Source == null) { escape2.Source = s2; escape2.Start = start2; escape2.Index = cur2 + ct2.Source.Length; escape2.End = end2; + escape2.Optional = quickCheckPos2; s2 = ct2.Replacement; idx2 = 0; start2 = 0; end2 = s2.Length; + quickCheckPos2 = 0; continue; } } @@ -938,11 +913,13 @@ Console.WriteLine (" -> '{0}'", c.Replacement); sk2 [0] = cat2; sk2 [1] = Level1 (i2); if (!ignoreNonSpace && currentLevel > 1) - sk2 [2] = Level2 (i2); + sk2 [2] = Level2 (i2, ext2); if (currentLevel > 2) sk2 [3] = Uni.Level3 (i2); if (currentLevel > 3) special2 = Uni.HasSpecialWeight ((char) i2); + if (cat2 > 1) + previousChar2 = i2; idx2++; } @@ -958,7 +935,7 @@ Console.WriteLine (" -> '{0}'", c.Replacement); if (sk1 [2] == 0) sk1 [2] = 2; sk1 [2] = (byte) (sk1 [2] + - Level2 (s1 [idx1])); + Level2 (s1 [idx1], ExtenderType.None)); idx1++; } @@ -969,7 +946,7 @@ Console.WriteLine (" -> '{0}'", c.Replacement); if (sk2 [2] == 0) sk2 [2] = 2; sk2 [2] = (byte) (sk2 [2] + - Level2 (s2 [idx2])); + Level2 (s2 [idx2], ExtenderType.None)); idx2++; } } @@ -1008,8 +985,8 @@ Console.WriteLine (" -> '{0}'", c.Replacement); !Uni.IsJapaneseSmallLetter ((char) i1), !Uni.IsJapaneseSmallLetter ((char) i2)); ret = ret != 0 ? ret : - Uni.GetJapaneseDashType ((char) i1) - - Uni.GetJapaneseDashType ((char) i2); + ToDashTypeValue (ext1) - + ToDashTypeValue (ext2); ret = ret != 0 ? ret : CompareFlagPair ( Uni.IsHiragana ((char) i1), Uni.IsHiragana ((char) i2)); @@ -1034,22 +1011,44 @@ Console.WriteLine (" -> '{0}'", c.Replacement); break; if (!Uni.IsIgnorableNonSpacing (s2 [idx2])) break; - finalResult = Level2 (FilterOptions ((s1 [idx1]))) - Level2 (FilterOptions (s2 [idx2])); + finalResult = Level2 (FilterOptions ((s1 [idx1])), ext1) - Level2 (FilterOptions (s2 [idx2]), ext2); if (finalResult != 0) break; idx1++; idx2++; + // they should work only for the first character + ext1 = ExtenderType.None; + ext2 = ExtenderType.None; } } + if (currentLevel == 1 && finalResult != 0) { + while (idx1 < end1) + if (Uni.IsIgnorableNonSpacing (s1 [idx1])) + idx1++; + while (idx2 < end2) + if (Uni.IsIgnorableNonSpacing (s2 [idx2])) + idx2++; + } // we still have to handle level 5 if (finalResult == 0) { finalResult = lv5At1 - lv5At2; if (finalResult == 0) finalResult = lv5Value1 - lv5Value2; } + if (finalResult == 0) { + if (idx2 == end2) + targetConsumed = true; + if (idx1 == end1) + sourceConsumed = true; + } return idx1 != end1 ? 1 : idx2 == end2 ? finalResult : -1; } + int CompareFlagPair (bool b1, bool b2) + { + return b1 == b2 ? 0 : b1 ? 1 : -1; + } + #endregion #region IsPrefix() and IsSuffix() @@ -1062,140 +1061,17 @@ Console.WriteLine (" -> '{0}'", c.Replacement); public bool IsPrefix (string s, string target, int start, int length, CompareOptions opt) { SetOptions (opt); - return IsPrefix (s, target, start, length); - } - - // returns the consumed length in positive number, or -1 if - // target was not a prefix. - bool IsPrefix (string s, string target, int start, int length) - { - // quick check : simple codepoint comparison - if (s.Length >= target.Length) { - int si = start; - for (int i = 0; si < length && i < target.Length; i++, si++) - if (s [si] != target [i]) - break; - if (si == start + target.Length) - return true; - } - - escapedSourceIndex = -1; - return IsPrefixInternal (s, target, start, length); - } - - bool IsPrefixInternal (string s, string target, int start, int length) - { - int si = start; - int end = start + length; - int ti = 0; - string source = s; - - while (ti < target.Length) { - if (IsIgnorable (target [ti])) { - ti++; - continue; - } - if (si >= end) { - if (s == source) - break; - s = source; - si = escapedSourceIndex; - end = start + length; - escapedSourceIndex = -1; - continue; - } - if (IsIgnorable (s [si])) { - si++; - continue; - } - - // Check contraction for target. - Contraction ctt = GetContraction (target, ti, target.Length); - if (ctt != null) { - ti += ctt.Source.Length; - if (ctt.SortKey != null) { - int ret = GetMatchLength (ref s, ref si, ref end, -1, ctt.SortKey, true); - if (ret < 0) - return false; - si += ret; - } else { - string r = ctt.Replacement; - int i = 0; - while (i < r.Length && si < end) { - int ret = GetMatchLength (ref s, ref si, ref end, r [i]); - if (ret < 0) - return false; - si += ret; - i++; - } - if (i < r.Length && si >= end) - return false; - } - } - else { - int ret = GetMatchLength (ref s, ref si, ref end, target [ti]); - if (ret < 0) - return false; - si += ret; - ti++; - } - } - if (si == end) { - // All codepoints in the compared range - // matches. In that case, what matters - // is whether the remaining part of - // "target" is ignorable or not. - while (ti < target.Length) - if (!IsIgnorable (target [ti++])) - return false; - return true; - } - return true; + return IsPrefix (s, target, start, length, + (opt & CompareOptions.StringSort) != 0, true); } - // WARNING: Don't invoke it outside IsPrefix(). - int GetMatchLength (ref string s, ref int idx, ref int end, char target) + public bool IsPrefix (string s, string target, int start, int length, bool stringSort, bool skipHeadingExtenders) { - int it = FilterOptions ((int) target); - charSortKey [0] = Category (it); - charSortKey [1] = Level1 (it); - if (!ignoreNonSpace) - charSortKey [2] = Level2 (it); - charSortKey [3] = Uni.Level3 (it); - - return GetMatchLength (ref s, ref idx, ref end, it, charSortKey, !Uni.HasSpecialWeight ((char) it)); - } - - // WARNING: Don't invoke it outside IsPrefix(). - // returns consumed source length (mostly 1, source length in case of contraction) - int GetMatchLength (ref string s, ref int idx, ref int end, int it, byte [] sortkey, bool noLv4) - { - Contraction ct = null; - // If there is already expansion, then it should not - // process further expansions. - if (escapedSourceIndex < 0) - ct = GetContraction (s, idx, end); - if (ct != null) { - if (ct.SortKey != null) { - if (!noLv4) - return -1; - for (int i = 0; i < ct.SortKey.Length; i++) - if (sortkey [i] != ct.SortKey [i]) - return -1; - return ct.Source.Length; - } else { - escapedSourceIndex = idx + ct.Source.Length; - s = ct.Replacement; - idx = 0; - end = s.Length; - return GetMatchLength (ref s, ref idx, ref end, it, sortkey, noLv4); - } - } else { - // primitive comparison - if (Compare (s [idx], it, sortkey) != 0) - return -1; - return 1; - } + bool consumed, dummy; + int ret = CompareInternal (s, start, length, + target, 0, target.Length, stringSort, + out consumed, out dummy, skipHeadingExtenders); + return consumed; } // IsSuffix() @@ -1207,8 +1083,6 @@ Console.WriteLine (" -> '{0}'", c.Replacement); public bool IsSuffix (string s, string target, int start, int length, CompareOptions opt) { - SetOptions (opt); - // quick check : simple codepoint comparison if (s.Length >= target.Length) { int si = start; @@ -1220,188 +1094,96 @@ Console.WriteLine (" -> '{0}'", c.Replacement); return true; } - escapedSourceIndex = -1; - return IsSuffix (s, target, start, length); + SetOptions (opt); + return IsSuffix (s, target, start, length, + (opt & CompareOptions.StringSort) != 0); } - bool IsSuffix (string s, string target, int start, int length) + bool IsSuffix (string s, string t, int start, int length, + bool stringSort) { + int tstart = 0; + for (;tstart < t.Length; tstart++) + if (!IsIgnorable (t [tstart])) + break; + if (tstart == t.Length) + return true; // as if target is String.Empty. + +#if false +// This is still not working. If it is not likely to get working, then just remove it. int si = start; - int ti = target.Length - 1; - string source = s; - int end = start - length + 1; + int send = start - length; + int ti = t.Length - 1; + int tend = -1; - while (ti >= 0) { - if (IsIgnorable (target [ti])) { - ti--; - continue; - } - if (si < 0) { - if (s == source) + int sStep = start + 1; + int tStep = t.Length; + bool sourceConsumed, targetConsumed; + while (true) { + for (; send < si; si--) + if (!IsIgnorable (s [si])) break; - s = source; - si = escapedSourceIndex; - escapedSourceIndex = -1; - continue; - } - if (IsIgnorable (s [si])) { - si--; - continue; - } - - // Check contraction for target. - Contraction ctt = GetTailContraction (target, ti, 0); - if (ctt != null) { - ti -= ctt.Source.Length; - if (ctt.SortKey != null) { - int ret = GetMatchLengthBack (ref s, ref si, ref end, -1, ctt.SortKey, true); - if (ret < 0) - return false; - si -= ret; - } else { - string r = ctt.Replacement; - int i = r.Length - 1; - while (i >= 0 && si >= end) { - int ret = GetMatchLengthBack (ref s, ref si, ref end, r [i]); - if (ret < 0) - return false; - si -= ret; - i--; - } - if (i >= 0 && si < end) - return false; - } - } - else { - int ret = GetMatchLengthBack (ref s, ref si, ref end, target [ti]); - if (ret < 0) - return false; - si -= ret; - ti--; - } - } - if (si < end) { - // All codepoints in the compared range - // matches. In that case, what matters - // is whether the remaining part of - // "target" is ignorable or not. - while (ti >= 0) - if (!IsIgnorable (target [ti--])) - return false; - return true; + for (; tend < ti; ti--) + if (!IsIgnorable (t [ti])) + break; + if (tend == ti) + break; + for (; send < si; si--) + if (IsSafe (s [si])) + break; + for (; tend < ti; ti--) + if (IsSafe (t [ti])) + break; +Console.WriteLine ("==== {0} {1} {2} {3} {4} {5} {6} {7} {8}", s, si, send, length, t, ti, tstart, sStep - si, tStep - ti); + if (CompareInternal (s, si, sStep - si, + t, ti, tStep - ti, stringSort, + out targetConsumed, out sourceConsumed, + true) != 0) + return false; + if (send == si) + return false; + sStep = si; + tStep = ti; + si--; + ti--; } return true; - } - - // WARNING: Don't invoke it outside IsSuffix(). - int GetMatchLengthBack (ref string s, ref int idx, ref int end, char target) - { - int it = FilterOptions ((int) target); - charSortKey [0] = Category (it); - charSortKey [1] = Level1 (it); - if (!ignoreNonSpace) - charSortKey [2] = Level2 (it); - charSortKey [3] = Uni.Level3 (it); - - return GetMatchLengthBack (ref s, ref idx, ref end, it, charSortKey, !Uni.HasSpecialWeight ((char) it)); - } - - // WARNING: Don't invoke it outside IsSuffix(). - // returns consumed source length (mostly 1, source length in case of contraction) - int GetMatchLengthBack (ref string s, ref int idx, ref int end, int it, byte [] sortkey, bool noLv4) - { - Contraction ct = null; - // If there is already expansion, then it should not - // process further expansions. - if (escapedSourceIndex < 0) - ct = GetTailContraction (s, idx, end); - if (ct != null) { - if (ct.SortKey != null) { - if (!noLv4) - return -1; - for (int i = 0; i < ct.SortKey.Length; i++) - if (sortkey [i] != ct.SortKey [i]) - return -1; - return ct.Source.Length; - } else { - escapedSourceIndex = idx - ct.Source.Length; - s = ct.Replacement; - idx = s.Length - 1; - end = 0; - return GetMatchLength (ref s, ref idx, ref end, it, sortkey, noLv4); +#else + // FIXME: it is not efficient for very long target. + bool sourceConsumed, targetConsumed; + int mismatchCount = 0; + for (int i = 0; i < length; i++) { + escape1.Source = escape2.Source = null; + previousSortKey = previousSortKey2 = null; + previousChar = previousChar2 = -1; + + int ret = CompareInternal (s, start - i, i + 1, + t, tstart, t.Length - tstart, + stringSort, out targetConsumed, + out sourceConsumed, true); + if (ret == 0) + return true; + if (!sourceConsumed && targetConsumed) + return false; + if (!targetConsumed) { + mismatchCount++; + if (mismatchCount > Uni.MaxExpansionLength) + // The largest length of an + // expansion is 3, so if the + // target was not consumed more + // than 3 times, then the tail + // character does not match. + return false; } - } else { - // primitive comparison - if (Compare (s [idx], it, sortkey) != 0) - return -1; - return 1; - } - } - - // Common use methods - - // returns comparison result. - private int Compare (char src, int ct, byte [] sortkey) - { - // char-by-char comparison. - int cs = FilterOptions (src); - if (cs == ct) - return 0; - // lv.1 to 3 - int ret = Category (cs) - Category (ct); - if (ret != 0) - return ret; - ret = Level1 (cs) - Level1 (ct); - if (ret != 0) - return ret; - if (!ignoreNonSpace) { - ret = Level2 (cs) - Level2 (ct); - if (ret != 0) - return ret; } - ret = Uni.Level3 (cs) - Uni.Level3 (ct); - if (ret != 0) - return ret; - // lv.4 (only when required). No need to check cj coz - // there is no pair of characters that has the same - // primary level and differs here. - if (!Uni.HasSpecialWeight (src)) - return 0; - char target = (char) ct; - ret = CompareFlagPair ( - !Uni.IsJapaneseSmallLetter (src), - !Uni.IsJapaneseSmallLetter (target)); - if (ret != 0) - return ret; - ret = Uni.GetJapaneseDashType (src) - - Uni.GetJapaneseDashType (target); - if (ret != 0) - return ret; - ret = CompareFlagPair (Uni.IsHiragana (src), - Uni.IsHiragana (target)); - if (ret != 0) - return ret; - ret = CompareFlagPair (!IsHalfKana (src), - !IsHalfKana (target)); - return ret; - } - - int CompareFlagPair (bool b1, bool b2) - { - return b1 == b2 ? 0 : b1 ? 1 : -1; + return false; +#endif } #endregion #region IndexOf() / LastIndexOf() - // IndexOf (string, string, CompareOptions) - // IndexOf (string, string, int, int, CompareOptions) - // IndexOf (string, char, int, int, CompareOptions) - // IndexOfPrimitiveChar (string, int, int, char) - // IndexOfSortKey (string, int, int, byte[], char, int, bool) - // IndexOf (string, string, int, int) - public int IndexOf (string s, string target, CompareOptions opt) { return IndexOf (s, target, 0, s.Length, opt); @@ -1410,7 +1192,8 @@ Console.WriteLine (" -> '{0}'", c.Replacement); public int IndexOf (string s, string target, int start, int length, CompareOptions opt) { SetOptions (opt); - return IndexOf (s, target, start, length); + return IndexOf (s, target, start, length, + (opt & CompareOptions.StringSort) != 0); } public int IndexOf (string s, char target, CompareOptions opt) @@ -1426,33 +1209,32 @@ Console.WriteLine (" -> '{0}'", c.Replacement); Contraction ct = GetContraction (target); if (ct != null) { if (ct.Replacement != null) - return IndexOfPrimitiveChar (s, start, length, ct.Replacement [0]); + return IndexOf (s, ct.Replacement, start, length, + (opt & CompareOptions.StringSort) != 0); else return IndexOfSortKey (s, start, length, ct.SortKey, char.MinValue, -1, true); + } else { + int ti = FilterOptions ((int) target); + charSortKeyIndexTarget [0] = Category (ti); + charSortKeyIndexTarget [1] = Level1 (ti); + if (!ignoreNonSpace) + charSortKeyIndexTarget [2] = + Level2 (ti, ExtenderType.None); + charSortKeyIndexTarget [3] = Uni.Level3 (ti); + return IndexOfSortKey (s, start, length, + charSortKeyIndexTarget, target, ti, + !Uni.HasSpecialWeight ((char) ti)); } - else - return IndexOfPrimitiveChar (s, start, length, target); - } - - // Searches target char w/o checking contractions - int IndexOfPrimitiveChar (string s, int start, int length, char target) - { - int ti = FilterOptions ((int) target); - charSortKey [0] = Category (ti); - charSortKey [1] = Level1 (ti); - if (!ignoreNonSpace) - charSortKey [2] = Level2 (ti); - charSortKey [3] = Uni.Level3 (ti); - return IndexOfSortKey (s, start, length, charSortKey, target, ti, !Uni.HasSpecialWeight ((char) ti)); } // Searches target byte[] keydata int IndexOfSortKey (string s, int start, int length, byte [] sortkey, char target, int ti, bool noLv4) { int end = start + length; - for (int idx = start; idx < end; idx++) { + int idx = start; + while (idx < end) { int cur = idx; - if (Matches (s, ref idx, end, ti, target, sortkey, noLv4, false)) + if (MatchesForward (s, ref idx, end, ti, sortkey, noLv4)) return cur; } return -1; @@ -1460,7 +1242,7 @@ Console.WriteLine (" -> '{0}'", c.Replacement); // Searches string. Search head character (or keydata when // the head is contraction sortkey) and try IsPrefix(). - int IndexOf (string s, string target, int start, int length) + int IndexOf (string s, string target, int start, int length, bool stringSort) { int tidx = 0; for (; tidx < target.Length; tidx++) @@ -1469,22 +1251,54 @@ Console.WriteLine (" -> '{0}'", c.Replacement); if (tidx == target.Length) return start; Contraction ct = GetContraction (target, tidx, target.Length - tidx); - byte [] sortkey = ct != null ? ct.SortKey : null; string replace = ct != null ? ct.Replacement : null; + byte [] sk = replace == null ? charSortKeyIndexTarget : null; + bool noLv4 = true; + char tc = char.MinValue; + int ti = -1; + if (ct != null && sk != null) { + for (int i = 0; i < ct.SortKey.Length; i++) + sk [i] = ct.SortKey [i]; + } else if (sk != null) { + tc = target [tidx]; + ti = FilterOptions (target [tidx]); + sk [0] = Category (ti); + sk [1] = Level1 (ti); + if (!ignoreNonSpace) + sk [2] = Level2 (ti, ExtenderType.None); + sk [3] = Uni.Level3 (ti); + noLv4 = !Uni.HasSpecialWeight ((char) ti); + } + if (sk != null) { + for (tidx++; tidx < target.Length; tidx++) { + if (Category (target [tidx]) != 1) + break; + if (sk [2] == 0) + sk [2] = 2; + sk [2] = (byte) (sk [2] + Level2 (target [tidx], ExtenderType.None)); + } + } + do { int idx = 0; - if (sortkey != null) - idx = IndexOfSortKey (s, start, length, ct.SortKey, char.MinValue, -1, true); - else if (replace != null) - idx = IndexOf (s, replace, start, length); + if (replace != null) + idx = IndexOf (s, replace, start, length, stringSort); else - idx = IndexOfPrimitiveChar (s, start, length, target [tidx]); + idx = IndexOfSortKey (s, start, length, sk, tc, ti, noLv4); if (idx < 0) return -1; - if (IsPrefix (s, target, idx, length - (idx - start))) + length -= idx - start; + start = idx; + if (IsPrefix (s, target, start, length, stringSort, false)) return idx; - start++; - length--; + Contraction cts = GetContraction (s, start, length); + if (cts != null) { + start += cts.Source.Length; + length -= cts.Source.Length; + } else { + start++; + length--; + } } while (length > 0); return -1; } @@ -1502,7 +1316,8 @@ Console.WriteLine (" -> '{0}'", c.Replacement); public int LastIndexOf (string s, string target, int start, int length, CompareOptions opt) { SetOptions (opt); - return LastIndexOf (s, target, start, length); + return LastIndexOf (s, target, start, length, + (opt & CompareOptions.StringSort) != 0); } public int LastIndexOf (string s, char target, CompareOptions opt) @@ -1518,34 +1333,29 @@ Console.WriteLine (" -> '{0}'", c.Replacement); Contraction ct = GetContraction (target); if (ct != null) { if (ct.Replacement != null) - return LastIndexOfPrimitiveChar (s, start, length, ct.Replacement [0]); + return LastIndexOf (s, ct.Replacement, start, length, (opt & CompareOptions.StringSort) != 0); else - return LastIndexOfSortKey (s, start, length, ct.SortKey, char.MinValue, -1, true); + return LastIndexOfSortKey (s, start, start, length, ct.SortKey, char.MinValue, -1, true); + } + else { + int ti = FilterOptions ((int) target); + charSortKeyIndexTarget [0] = Category (ti); + charSortKeyIndexTarget [1] = Level1 (ti); + if (!ignoreNonSpace) + charSortKeyIndexTarget [2] = Level2 (ti, ExtenderType.None); + charSortKeyIndexTarget [3] = Uni.Level3 (ti); + return LastIndexOfSortKey (s, start, start, length, charSortKeyIndexTarget, target, ti, !Uni.HasSpecialWeight ((char) ti)); } - else - return LastIndexOfPrimitiveChar (s, start, length, target); - } - - // Searches target char w/o checking contractions - int LastIndexOfPrimitiveChar (string s, int start, int length, char target) - { - int ti = FilterOptions ((int) target); - charSortKey [0] = Category (ti); - charSortKey [1] = Level1 (ti); - if (!ignoreNonSpace) - charSortKey [2] = Level2 (ti); - charSortKey [3] = Uni.Level3 (ti); - return LastIndexOfSortKey (s, start, length, charSortKey, target, ti, !Uni.HasSpecialWeight ((char) ti)); } // Searches target byte[] keydata - int LastIndexOfSortKey (string s, int start, int length, byte [] sortkey, char target, int ti, bool noLv4) + int LastIndexOfSortKey (string s, int start, int orgStart, int length, byte [] sortkey, char target, int ti, bool noLv4) { int end = start - length; - - for (int idx = start; idx > end; idx--) { + int idx = start; + while (idx > end) { int cur = idx; - if (Matches (s, ref idx, end, ti, target, sortkey, noLv4, true)) + if (MatchesBackward (s, ref idx, end, orgStart, ti, sortkey, noLv4)) return cur; } return -1; @@ -1553,7 +1363,7 @@ Console.WriteLine (" -> '{0}'", c.Replacement); // Searches string. Search head character (or keydata when // the head is contraction sortkey) and try IsPrefix(). - int LastIndexOf (string s, string target, int start, int length) + int LastIndexOf (string s, string target, int start, int length, bool stringSort) { int orgStart = start; int tidx = 0; @@ -1563,79 +1373,144 @@ Console.WriteLine (" -> '{0}'", c.Replacement); if (tidx == target.Length) return start; Contraction ct = GetContraction (target, tidx, target.Length - tidx); - byte [] sortkey = ct != null ? ct.SortKey : null; string replace = ct != null ? ct.Replacement : null; + byte [] sk = replace == null ? charSortKeyIndexTarget : null; + + bool noLv4 = true; + char tc = char.MinValue; + int ti = -1; + if (ct != null && sk != null) { + for (int i = 0; i < ct.SortKey.Length; i++) + sk [i] = ct.SortKey [i]; + } else if (sk != null) { + tc = target [tidx]; + ti = FilterOptions (target [tidx]); + sk [0] = Category (ti); + sk [1] = Level1 (ti); + if (!ignoreNonSpace) + sk [2] = Level2 (ti, ExtenderType.None); + sk [3] = Uni.Level3 (ti); + noLv4 = !Uni.HasSpecialWeight ((char) ti); + } + if (sk != null) { + for (tidx++; tidx < target.Length; tidx++) { + if (Category (target [tidx]) != 1) + break; + if (sk [2] == 0) + sk [2] = 2; + sk [2] = (byte) (sk [2] + Level2 (target [tidx], ExtenderType.None)); + } + } do { int idx = 0; - if (sortkey != null) - idx = LastIndexOfSortKey (s, start, length, ct.SortKey, char.MinValue, -1, true); - else if (replace != null) - idx = LastIndexOf (s, replace, start, length); - else - idx = LastIndexOfPrimitiveChar (s, start, length, target [tidx]); + if (replace != null) + idx = LastIndexOf (s, replace, start, length, stringSort); + else + idx = LastIndexOfSortKey (s, start, orgStart, length, sk, tc, ti, noLv4); if (idx < 0) return -1; - if (IsPrefix (s, target, idx, orgStart - idx + 1)) { + length -= start - idx; + start = idx; + + if (IsPrefix (s, target, idx, orgStart - idx + 1, stringSort, false)) { for (;idx < orgStart; idx++) if (!IsIgnorable (s [idx])) break; return idx; } - length--; - start--; + Contraction cts = GetContraction (s, idx, orgStart - idx + 1); + if (cts != null) { + start -= cts.Source.Length; + length -= cts.Source.Length; + } else { + start--; + length--; + } } while (length > 0); return -1; } - private bool Matches (string s, ref int idx, int end, int ti, char target, byte [] sortkey, bool noLv4, bool lastIndexOf) + private bool MatchesForward (string s, ref int idx, int end, int ti, byte [] sortkey, bool noLv4) { - switch (char.GetUnicodeCategory (s [idx])) { - case UnicodeCategory.PrivateUse: - case UnicodeCategory.Surrogate: - if (s [idx] != target) + int si = -1; + ExtenderType ext = GetExtenderType (s [idx]); + Contraction ct = null; + if (ext == ExtenderType.None) + ct = GetContraction (s, idx, end); + else if (previousChar < 0) { + if (previousSortKey == null) { + idx++; return false; - return true; + } + charSortKey = previousSortKey; } - - char sc = char.MinValue; - Contraction ct = GetContraction (s, idx, end); + else + si = FilterExtender (previousChar, ext); // if lv4 exists, it never matches contraction - if (ct != null && noLv4) { - if (lastIndexOf) - idx -= ct.Source.Length - 1; - else - idx += ct.Source.Length - 1; + if (ct != null) { + idx += ct.Source.Length; + if (!noLv4) + return false; if (ct.SortKey != null) { for (int i = 0; i < sortkey.Length; i++) - if (ct.SortKey [i] != sortkey [i]) - return false; - return true; + charSortKey [i] = sortkey [i]; + previousChar = -1; + previousSortKey = charSortKey; + } else { + // Here is the core of LAMESPEC + // described at the top of the source. + int dummy = 0; + return MatchesForward (ct.Replacement, ref dummy, + ct.Replacement.Length, ti, sortkey, noLv4); } - // Here is the core of LAMESPEC - // described at the top of the source. - sc = ct.Replacement [0]; + } else { + if (si < 0) + si = FilterOptions (s [idx]); + charSortKey [0] = Category (si); + charSortKey [1] = Level1 (si); + if (!ignoreNonSpace) + charSortKey [2] = Level2 (si, ext); + charSortKey [3] = Uni.Level3 (si); + if (charSortKey [0] != 1) + previousChar = si; + idx++; + } + for (; idx < end; idx++) { + if (Category (s [idx]) != 1) + break; + if (ignoreNonSpace) + continue; + if (charSortKey [2] == 0) + charSortKey [2] = 2; + charSortKey [2] = (byte) (charSortKey [2] + + Level2 (s [idx], ExtenderType.None)); } - else - sc = s [idx]; - if (sc == target) - return true; - int si = FilterOptions ((int) sc); - if (Category (si) != sortkey [0] || - Level1 (si) != sortkey [1] || - !ignoreNonSpace && Level2 (si) != sortkey [2] || - Uni.Level3 (si) != sortkey [3]) + return MatchesPrimitive (charSortKey, si, ext, sortkey, ti, noLv4); + } + + private bool MatchesPrimitive (byte [] charSortKey, int si, ExtenderType ext, byte [] sortkey, int ti, bool noLv4) + { + if (charSortKey [0] != sortkey [0] || + charSortKey [1] != sortkey [1] || + (!ignoreNonSpace && charSortKey [2] != sortkey [2]) || + charSortKey [3] != sortkey [3]) return false; - if (noLv4 && !Uni.HasSpecialWeight ((char) si)) + if (noLv4 && (si < 0 || !Uni.HasSpecialWeight ((char) si))) return true; else if (noLv4) return false; + // Since target can never be an extender, if the source + // is an expander and it matters, then they never match. + if (!ignoreNonSpace && ext == ExtenderType.Conditional) + return false; if (Uni.IsJapaneseSmallLetter ((char) si) != Uni.IsJapaneseSmallLetter ((char) ti) || - Uni.GetJapaneseDashType ((char) si) != - Uni.GetJapaneseDashType ((char) ti) || + ToDashTypeValue (ext) != + // FIXME: we will have to specify correct value for target + ToDashTypeValue (ExtenderType.None) || !Uni.IsHiragana ((char) si) != !Uni.IsHiragana ((char) ti) || IsHalfKana ((char) si) != @@ -1644,6 +1519,88 @@ Console.WriteLine (" -> '{0}'", c.Replacement); return true; } + private bool MatchesBackward (string s, ref int idx, int end, int orgStart, int ti, byte [] sortkey, bool noLv4) + { + int cur = idx; + int si = -1; + ExtenderType ext = GetExtenderType (s [idx]); + // To handle extenders in source, we need to + // check next _primary_ character. + if (ext != ExtenderType.None) { + byte diacritical = 0; + for (int tmp = 0; ; tmp--) { + if (tmp < 0) // heading extender + return false; + if (IsIgnorable (s [tmp])) + continue; + int tmpi = FilterOptions (s [tmp]); + byte category = Category (tmpi); + if (category == 1) { + diacritical = Level2 (tmpi, ExtenderType.None); + continue; + } + si = FilterExtender (tmpi, ext); + charSortKey [0] = category; + charSortKey [1] = Level1 (si); + if (!ignoreNonSpace) + charSortKey [2] = Level2 (si, ext); + charSortKey [3] = Uni.Level3 (si); + if (ext != ExtenderType.Conditional && + diacritical != 0) + charSortKey [2] = + (charSortKey [2] == 0) ? + (byte) (diacritical + 2) : + diacritical; + break; + } + idx--; + } + Contraction ct = null; + if (ext == ExtenderType.None) + ct = GetContraction (s, idx, end); + // if lv4 exists, it never matches contraction + if (ct != null) { + idx -= ct.Source.Length; + if (!noLv4) + return false; + if (ct.SortKey != null) { + for (int i = 0; i < sortkey.Length; i++) + charSortKey [i] = sortkey [i]; + previousChar = -1; + previousSortKey = charSortKey; + } else { + // Here is the core of LAMESPEC + // described at the top of the source. + int dummy = ct.Replacement.Length - 1; + return MatchesBackward (ct.Replacement, + ref dummy, dummy, -1, ti, sortkey, noLv4); + } + } else if (ext == ExtenderType.None) { + if (si < 0) + si = FilterOptions (s [idx]); + charSortKey [0] = Category (si); + charSortKey [1] = Level1 (si); + if (!ignoreNonSpace) + charSortKey [2] = Level2 (si, ext); + charSortKey [3] = Uni.Level3 (si); + if (charSortKey [0] != 1) + previousChar = si; + idx--; + } + if (ext == ExtenderType.None) { + for (int tmp = cur + 1; tmp < orgStart; tmp++) { + if (Category (s [tmp]) != 1) + break; + if (ignoreNonSpace) + continue; + if (charSortKey [2] == 0) + charSortKey [2] = 2; + charSortKey [2] = (byte) (charSortKey [2] + + Level2 (s [tmp], ExtenderType.None)); + } + } + return MatchesPrimitive (charSortKey, si, ext, sortkey, ti, noLv4); + } #endregion } }