2 // SimpleCollator.cs : the core collator implementation
5 // Atsushi Enomoto <atsushi@ximian.com>
7 // Copyright (C) 2005 Novell, Inc (http://www.novell.com)
9 // Permission is hereby granted, free of charge, to any person obtaining
10 // a copy of this software and associated documentation files (the
11 // "Software"), to deal in the Software without restriction, including
12 // without limitation the rights to use, copy, modify, merge, publish,
13 // distribute, sublicense, and/or sell copies of the Software, and to
14 // permit persons to whom the Software is furnished to do so, subject to
15 // the following conditions:
17 // The above copyright notice and this permission notice shall be
18 // included in all copies or substantial portions of the Software.
20 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
30 // Here's a summary for supporting contractions, expansions and diacritical
33 // Diacritical mapping is a simple tailoring rule that "remaps" diacritical
34 // weight value from one to another. For now it applies to all range of
35 // characters, but at some stage we might need to limit the remapping ranges.
37 // A Contraction consists of a string (actually char[]) and a sortkey entry
38 // (i.e. byte[]). It indicates that a sequence of characters is interpreted
39 // as a single character that has the mapped sortkey value. There is no
40 // character which goes across "special rules". When the collator encountered
41 // such a sequence of characters, it just returns the sortkey value without
42 // further replacement.
44 // Since it is still likely to happen that a contraction sequence matches
45 // other character than the identical sequence (depending on CompareOptions
46 // and of course, defined sortkey value itself), comparison cannot be done
47 // at source char[] level.
49 // In IndexOf(), the first character in the target string or the target char
50 // itself is turned into sortkey bytes. If the character has a contraction and
51 // that is sortkey map, then it is used instead. If the contraction exists and
52 // that is replacement map, then the first character of the replacement string
53 // is searched instead. IndexOf() always searches only for the top character,
54 // and if it returned negative value, then it returns -1. Otherwise, it then
55 // tries IsPrefix() from that location. If it returns true, then it returns
58 // LAMESPEC: IndexOf() is lame as a whole API. It never matches in the middle
59 // of expansion and there is no proper way to return such indexes within
60 // a single int return value.
62 // For example, try below in .NET:
63 // IndexOf("\u00E6", "a")
64 // IndexOf("\u00E6", "e")
69 using System.Collections;
70 using System.Globalization;
72 using Uni = Mono.Globalization.Unicode.MSCompatUnicodeTable;
73 using UUtil = Mono.Globalization.Unicode.MSCompatUnicodeTableUtil;
74 using COpt = System.Globalization.CompareOptions;
76 namespace Mono.Globalization.Unicode
78 internal class SimpleCollator
80 // this environment variable is for debugging quick check.
81 static bool QuickCheckDisabled =
82 Environment.GetEnvironmentVariable (
83 "MONO_COLLATION_QUICK_CHECK_DISABLED") == "yes";
85 unsafe internal struct Context
87 public Context (CompareOptions opt, byte* alwaysMatchFlags, byte* neverMatchFlags, byte* buffer1, byte* buffer2, byte* prev1, bool quickCheckPossible)
90 AlwaysMatchFlags = alwaysMatchFlags;
91 NeverMatchFlags = neverMatchFlags;
96 QuickCheckPossible = quickCheckPossible;
99 public readonly CompareOptions Option;
100 public readonly byte* NeverMatchFlags;
101 public readonly byte* AlwaysMatchFlags;
102 public byte* Buffer1;
103 public byte* Buffer2;
105 public byte* PrevSortKey;
106 public readonly bool QuickCheckPossible;
108 public void ClearPrevInfo ()
115 unsafe struct PreviousInfo
118 public byte* SortKey;
120 public PreviousInfo (bool dummy)
129 public string Source;
136 static SimpleCollator invariant =
137 new SimpleCollator (CultureInfo.InvariantCulture);
139 readonly TextInfo textInfo; // for ToLower().
140 readonly bool frenchSort;
141 unsafe readonly byte* cjkCatTable;
142 unsafe readonly byte* cjkLv1Table;
143 readonly CodePointIndexer cjkIndexer;
144 unsafe readonly byte* cjkLv2Table;
145 readonly CodePointIndexer cjkLv2Indexer;
147 readonly Contraction [] contractions;
148 readonly Level2Map [] level2Maps;
150 // This flag marks characters as "unsafe", where the character
151 // could be used as part of a contraction (whose length > 1).
152 readonly byte [] unsafeFlags;
154 const int UnsafeFlagLength = 0x300 / 8;
156 // readonly byte [] contractionFlags = new byte [16];
158 // This array is internally used inside IndexOf() to store
159 // "no need to check" ASCII characters.
161 // Now that it should be thread safe, this array is allocated
163 // byte [] neverMatchFlags = new byte [128 / 8];
165 #region .ctor() and split functions
167 public SimpleCollator (CultureInfo culture)
170 textInfo = culture.TextInfo;
173 SetCJKTable (culture, ref cjkIndexer,
174 ref cjkCatTable, ref cjkLv1Table,
175 ref cjkLv2Indexer, ref cjkLv2Table);
178 // Get tailoring info
179 TailoringInfo t = null;
180 for (CultureInfo ci = culture; ci.LCID != 127; ci = ci.Parent) {
181 t = Uni.GetTailoringInfo (ci.LCID);
185 if (t == null) // then use invariant
186 t = Uni.GetTailoringInfo (127);
188 frenchSort = t.FrenchSort;
189 Uni.BuildTailoringTables (culture, t, ref contractions,
191 unsafeFlags = new byte [UnsafeFlagLength];
192 foreach (Contraction c in contractions) {
193 if (c.Source.Length > 1)
194 foreach (char ch in c.Source)
195 unsafeFlags [(int) ch / 8 ]
196 |= (byte) (1 << ((int) ch & 7));
197 // for (int i = 0; i < c.Source.Length; i++) {
198 // int ch = c.Source [i];
201 // contractionFlags [ch / 8] |= (byte) (1 << (ch & 7));
205 foreach (Contraction c in invariant.contractions) {
206 if (c.Source.Length > 1)
207 foreach (char ch in c.Source)
208 unsafeFlags [(int) ch / 8 ]
209 |= (byte) (1 << ((int) ch & 7));
210 // for (int i = 0; i < c.Source.Length; i++) {
211 // int ch = c.Source [i];
214 // contractionFlags [ch / 8] |= (byte) (1 << (ch & 7));
218 // FIXME: Since tailorings are mostly for latin
219 // (and in some cases Cyrillic) characters, it would
220 // be much better for performance to store "start
221 // indexes" for > 370 (culture-specific letters).
224 // dump tailoring table
225 Console.WriteLine ("******** building table for {0} : contractions - {1} diacritical - {2}",
226 culture.LCID, contractions.Length, level2Maps.Length);
227 foreach (Contraction c in contractions) {
228 foreach (char cc in c.Source)
229 Console.Write ("{0:X4} ", (int) cc);
230 Console.WriteLine (" -> '{0}'", c.Replacement);
235 unsafe private void SetCJKTable (
236 CultureInfo culture, ref CodePointIndexer cjkIndexer,
237 ref byte* catTable, ref byte* lv1Table,
238 ref CodePointIndexer lv2Indexer, ref byte* lv2Table)
240 string name = GetNeutralCulture (culture).Name;
242 Uni.FillCJK (name, ref cjkIndexer, ref catTable,
243 ref lv1Table, ref lv2Indexer, ref lv2Table);
246 static CultureInfo GetNeutralCulture (CultureInfo info)
248 CultureInfo ret = info;
249 while (ret.Parent != null && ret.Parent.LCID != 127)
256 unsafe byte Category (int cp)
258 if (cp < 0x3000 || cjkCatTable == null)
259 return Uni.Category (cp);
260 int idx = cjkIndexer.ToIndex (cp);
261 return idx < 0 ? Uni.Category (cp) : cjkCatTable [idx];
264 unsafe byte Level1 (int cp)
266 if (cp < 0x3000 || cjkLv1Table == null)
267 return Uni.Level1 (cp);
268 int idx = cjkIndexer.ToIndex (cp);
269 return idx < 0 ? Uni.Level1 (cp) : cjkLv1Table [idx];
272 unsafe byte Level2 (int cp, ExtenderType ext)
274 if (ext == ExtenderType.Buggy)
276 else if (ext == ExtenderType.Conditional)
279 if (cp < 0x3000 || cjkLv2Table == null)
280 return Uni.Level2 (cp);
281 int idx = cjkLv2Indexer.ToIndex (cp);
282 byte ret = idx < 0 ? (byte) 0 : cjkLv2Table [idx];
285 ret = Uni.Level2 (cp);
286 if (level2Maps.Length == 0)
288 for (int i = 0; i < level2Maps.Length; i++) {
289 if (level2Maps [i].Source == ret)
290 return level2Maps [i].Replace;
291 else if (level2Maps [i].Source > ret)
297 static bool IsHalfKana (int cp, COpt opt)
299 return (opt & COpt.IgnoreWidth) != 0 ||
300 Uni.IsHalfWidthKana ((char) cp);
303 Contraction GetContraction (string s, int start, int end)
305 // int si = s [start];
306 // if (si < 128 && (contractionFlags [si / 8] & (1 << (si & 7))) == 0)
308 Contraction c = GetContraction (s, start, end, contractions);
309 if (c != null || lcid == 127)
311 return GetContraction (s, start, end, invariant.contractions);
314 Contraction GetContraction (string s, int start, int end, Contraction [] clist)
316 for (int i = 0; i < clist.Length; i++) {
317 Contraction ct = clist [i];
318 int diff = ct.Source [0] - s [start];
320 return null; // it's already sorted
323 char [] chars = ct.Source;
324 if (end - start < chars.Length)
327 for (int n = 0; n < chars.Length; n++)
328 if (s [start + n] != chars [n]) {
338 Contraction GetTailContraction (string s, int start, int end)
340 // int si = s [end - 1];
341 // if (si < 128 && (contractionFlags [si / 8] & (1 << (si & 7))) == 0)
343 Contraction c = GetTailContraction (s, start, end, contractions);
344 if (c != null || lcid == 127)
346 return GetTailContraction (s, start, end, invariant.contractions);
349 Contraction GetTailContraction (string s, int start, int end, Contraction [] clist)
351 if (start == end || end < -1 || start >= s.Length || s.Length <= end + 1)
352 throw new SystemException (String.Format ("MONO internal error. Failed to get TailContraction. start = {0} end = {1} string = '{2}'", start, end, s));
353 for (int i = 0; i < clist.Length; i++) {
354 Contraction ct = clist [i];
355 int diff = ct.Source [0] - s [end + 1];
357 return null; // it's already sorted
360 char [] chars = ct.Source;
363 if (chars.Length > start - end)
365 for (int n = 0; n < chars.Length; n++)
366 if (s [start - n] != chars [chars.Length - 1 - n]) {
376 Contraction GetContraction (char c)
378 Contraction ct = GetContraction (c, contractions);
379 if (ct != null || lcid == 127)
381 return GetContraction (c, invariant.contractions);
384 Contraction GetContraction (char c, Contraction [] clist)
386 for (int i = 0; i < clist.Length; i++) {
387 Contraction ct = clist [i];
388 if (ct.Source [0] > c)
389 return null; // it's already sorted
390 if (ct.Source [0] == c && ct.Source.Length == 1)
396 int FilterOptions (int i, COpt opt)
398 if ((opt & COpt.IgnoreWidth) != 0) {
399 int x = Uni.ToWidthCompat (i);
403 if ((opt & COpt.IgnoreCase) != 0)
404 i = textInfo.ToLower ((char) i);
405 if ((opt & COpt.IgnoreKanaType) != 0)
406 i = Uni.ToKanaTypeInsensitive (i);
418 ExtenderType GetExtenderType (int i)
420 // LAMESPEC: Windows expects true for U+3005, but
421 // sometimes it does not represent to repeat just
423 // Windows also expects true for U+3031 and U+3032,
424 // but they should *never* repeat one character.
426 // U+2015 becomes an extender only when it is Japanese
428 return lcid == 16 ? ExtenderType.Conditional :
431 if (i < 0x3005 || i > 0xFF70)
432 return ExtenderType.None;
437 return ExtenderType.Simple;
439 return ExtenderType.Conditional;
442 return ExtenderType.Voiced;
446 return ExtenderType.None;
448 case 0x3005: // LAMESPEC: see above.
449 return ExtenderType.Buggy;
450 case 0x3031: // LAMESPEC: see above.
451 case 0x3032: // LAMESPEC: see above.
454 return ExtenderType.Simple;
457 return ExtenderType.Voiced;
459 return ExtenderType.Conditional;
461 return ExtenderType.None;
465 static byte ToDashTypeValue (ExtenderType ext, COpt opt)
467 if ((opt & COpt.IgnoreNonSpace) != 0) // LAMESPEC: huh, why?
470 case ExtenderType.None:
472 case ExtenderType.Conditional:
479 int FilterExtender (int i, ExtenderType ext, COpt opt)
481 if (ext == ExtenderType.Conditional &&
482 Uni.HasSpecialWeight ((char) i)) {
483 bool half = IsHalfKana ((char) i, opt);
484 bool katakana = !Uni.IsHiragana ((char) i);
485 switch (Level1 (i) & 7) {
487 return half ? 0xFF71 : katakana ? 0x30A2 : 0x3042;
489 return half ? 0xFF72 : katakana ? 0x30A4 : 0x3044;
491 return half ? 0xFF73 : katakana ? 0x30A6 : 0x3046;
493 return half ? 0xFF74 : katakana ? 0x30A8 : 0x3048;
495 return half ? 0xFF75 : katakana ? 0x30AA : 0x304A;
501 static bool IsIgnorable (int i, COpt opt)
503 return Uni.IsIgnorable (i, (byte) (1 +
504 ((opt & COpt.IgnoreSymbols) != 0 ? 2 : 0) +
505 ((opt & COpt.IgnoreNonSpace) != 0 ? 4 : 0)));
511 return i / 8 >= unsafeFlags.Length ? true : (unsafeFlags [i / 8] & (1 << (i % 8))) == 0;
516 public SortKey GetSortKey (string s)
518 return GetSortKey (s, CompareOptions.None);
521 public SortKey GetSortKey (string s, CompareOptions options)
523 return GetSortKey (s, 0, s.Length, options);
526 public SortKey GetSortKey (string s, int start, int length, CompareOptions options)
528 SortKeyBuffer buf = new SortKeyBuffer (lcid);
529 buf.Initialize (options, lcid, s, frenchSort);
530 int end = start + length;
531 GetSortKey (s, start, end, buf, options);
532 return buf.GetResultAndReset ();
535 unsafe void GetSortKey (string s, int start, int end,
536 SortKeyBuffer buf, CompareOptions opt)
538 byte* prevbuf = stackalloc byte [4];
539 ClearBuffer (prevbuf, 4);
540 Context ctx = new Context (opt, null, null, null, null, prevbuf, false);
542 for (int n = start; n < end; n++) {
545 ExtenderType ext = GetExtenderType (i);
546 if (ext != ExtenderType.None) {
547 i = FilterExtender (ctx.PrevCode, ext, opt);
549 FillSortKeyRaw (i, ext, buf, opt);
550 else if (ctx.PrevSortKey != null) {
551 byte* b = ctx.PrevSortKey;
555 b [2] != 1 ? b [2] : Level2 (i, ext),
556 b [3] != 1 ? b [3] : Uni.Level3 (i));
558 // otherwise do nothing.
559 // (if the extender is the first char
560 // in the string, then just ignore.)
564 if (IsIgnorable (i, opt))
566 i = FilterOptions (i, opt);
568 Contraction ct = GetContraction (s, n, end);
570 if (ct.Replacement != null) {
571 GetSortKey (ct.Replacement, 0, ct.Replacement.Length, buf, opt);
573 byte* b = ctx.PrevSortKey;
574 for (int bi = 0; bi < ct.SortKey.Length; bi++)
575 b [bi] = ct.SortKey [bi];
579 b [2] != 1 ? b [2] : Level2 (i, ext),
580 b [3] != 1 ? b [3] : Uni.Level3 (i));
583 n += ct.Source.Length - 1;
586 if (!Uni.IsIgnorableNonSpacing (i))
588 FillSortKeyRaw (i, ExtenderType.None, buf, opt);
593 void FillSortKeyRaw (int i, ExtenderType ext,
594 SortKeyBuffer buf, CompareOptions opt)
596 if (0x3400 <= i && i <= 0x4DB5) {
597 int diff = i - 0x3400;
598 buf.AppendCJKExtension (
599 (byte) (0x10 + diff / 254),
600 (byte) (diff % 254 + 2));
604 UnicodeCategory uc = char.GetUnicodeCategory ((char) i);
606 case UnicodeCategory.PrivateUse:
607 int diff = i - 0xE000;
609 (byte) (0xE5 + diff / 254),
610 (byte) (diff % 254 + 2),
614 case UnicodeCategory.Surrogate:
615 FillSurrogateSortKeyRaw (i, buf);
619 byte level2 = Level2 (i, ext);
620 if (Uni.HasSpecialWeight ((char) i)) {
621 byte level1 = Level1 (i);
627 Uni.IsJapaneseSmallLetter ((char) i),
628 ToDashTypeValue (ext, opt),
629 !Uni.IsHiragana ((char) i),
630 IsHalfKana ((char) i, opt)
632 if ((opt & COpt.IgnoreNonSpace) == 0 && ext == ExtenderType.Voiced)
633 // Append voice weight
634 buf.AppendNormal (1, 1, 1, 0);
644 void FillSurrogateSortKeyRaw (int i, SortKeyBuffer buf)
653 lower = (byte) ((i == 0xD800) ? 0x3E : 0x3F);
654 } else if (0xD840 <= i && i < 0xD880) {
658 } else if (0xDB80 <= i && i < 0xDC00) {
659 diffbase = 0xDB80 - 0x40;
663 diffbase = 0xDC00 - 0xF8 + 2;
667 int diff = i - diffbase;
670 (byte) (segment + diff / 254),
671 (byte) (diff % 254 + 2),
680 public int Compare (string s1, string s2)
682 return Compare (s1, s2, CompareOptions.None);
685 public int Compare (string s1, string s2, CompareOptions options)
687 return Compare (s1, 0, s1.Length, s2, 0, s2.Length, options);
690 private int CompareOrdinal (string s1, int idx1, int len1,
691 string s2, int idx2, int len2)
693 int min = len1 < len2 ? len1 : len2;
694 int end1 = idx1 + min;
695 int end2 = idx2 + min;
696 if (idx1 < 0 || idx2 < 0 || end1 > s1.Length || end2 > s2.Length)
697 throw new SystemException (String.Format ("CompareInfo Internal Error: Should not happen. {0} {1} {2} {3} {4} {5}", idx1, idx2, len1, len2, s1.Length, s2.Length));
698 for (int i1 = idx1, i2 = idx2;
699 i1 < end1 && i2 < end2; i1++, i2++)
700 if (s1 [i1] != s2 [i2])
701 return s1 [i1] - s2 [i2];
702 return len1 == len2 ? 0 :
703 len1 == min ? - 1 : 1;
706 // mostly equivalent to CompareOrdinal, but the return value is
707 // not based on codepoints.
708 private int CompareQuick (string s1, int idx1, int len1,
709 string s2, int idx2, int len2, out bool sourceConsumed,
710 out bool targetConsumed, bool immediateBreakup)
712 sourceConsumed = false;
713 targetConsumed = false;
714 int min = len1 < len2 ? len1 : len2;
715 int end1 = idx1 + min;
716 int end2 = idx2 + min;
717 if (idx1 < 0 || idx2 < 0 || end1 > s1.Length || end2 > s2.Length)
718 throw new SystemException (String.Format ("CompareInfo Internal Error: Should not happen. {0} {1} {2} {3} {4} {5}", idx1, idx2, len1, len2, s1.Length, s2.Length));
719 for (int i1 = idx1, i2 = idx2;
720 i1 < end1 && i2 < end2; i1++, i2++)
721 if (s1 [i1] != s2 [i2]) {
722 if (immediateBreakup)
724 int ret = Category (s1 [i1]) - Category (s2 [i2]);
726 ret = Level1 (s1 [i1]) - Level1 (s2 [i2]);
729 ret = Uni.Level3 (s1 [i1]) - Uni.Level3 (s2 [i2]);
731 throw new SystemException (String.Format ("CompareInfo Internal Error: Should not happen. '{0}' {2} {3} '{1}' {4} {5}", s1, s2, idx1, end1, idx2, end2));
734 sourceConsumed = len1 <= len2;
735 targetConsumed = len1 >= len2;
736 return len1 == len2 ? 0 :
737 len1 == min ? - 1 : 1;
740 private int CompareOrdinalIgnoreCase (string s1, int idx1, int len1,
741 string s2, int idx2, int len2)
743 int min = len1 < len2 ? len1 : len2;
744 int end1 = idx1 + min;
745 int end2 = idx2 + min;
746 if (idx1 < 0 || idx2 < 0 || end1 > s1.Length || end2 > s2.Length)
747 throw new SystemException (String.Format ("CompareInfo Internal Error: Should not happen. {0} {1} {2} {3} {4} {5}", idx1, idx2, len1, len2, s1.Length, s2.Length));
748 TextInfo ti = invariant.textInfo;
749 for (int i1 = idx1, i2 = idx2;
750 i1 < end1 && i2 < end2; i1++, i2++)
751 if (ti.ToLower (s1 [i1]) != ti.ToLower (s2 [i2]))
752 return ti.ToLower (s1 [i1]) - ti.ToLower (s2 [i2]);
753 return len1 == len2 ? 0 :
754 len1 == min ? - 1 : 1;
757 public unsafe int Compare (string s1, int idx1, int len1,
758 string s2, int idx2, int len2, CompareOptions options)
760 // quick equality check
761 if (idx1 == idx2 && len1 == len2 &&
762 Object.ReferenceEquals (s1, s2))
764 // FIXME: this should be done inside Compare() at
766 // int ord = CompareOrdinal (s1, idx1, len1, s2, idx2, len2);
769 if (options == CompareOptions.Ordinal)
770 return CompareOrdinal (s1, idx1, len1, s2, idx2, len2);
772 if (options == CompareOptions.OrdinalIgnoreCase)
773 return CompareOrdinalIgnoreCase (s1, idx1, len1, s2, idx2, len2);
776 #if false // stable easy version, depends on GetSortKey().
777 SortKey sk1 = GetSortKey (s1, idx1, len1, options);
778 SortKey sk2 = GetSortKey (s2, idx2, len2, options);
779 byte [] d1 = sk1.KeyData;
780 byte [] d2 = sk2.KeyData;
781 int len = d1.Length > d2.Length ? d2.Length : d1.Length;
782 for (int i = 0; i < len; i++)
783 if (d1 [i] != d2 [i])
784 return d1 [i] < d2 [i] ? -1 : 1;
785 return d1.Length == d2.Length ? 0 : d1.Length < d2.Length ? -1 : 1;
787 byte* sk1 = stackalloc byte [4];
788 byte* sk2 = stackalloc byte [4];
789 ClearBuffer (sk1, 4);
790 ClearBuffer (sk2, 4);
791 Context ctx = new Context (options, null, null, sk1, sk2, null,
792 QuickCheckPossible (s1, idx1, idx1 + len1, s2, idx2, idx2 + len2));
795 int ret = CompareInternal (s1, idx1, len1, s2, idx2, len2, out dummy, out dummy2, true, false, ref ctx);
796 return ret == 0 ? 0 : ret < 0 ? -1 : 1;
800 unsafe void ClearBuffer (byte* buffer, int size)
802 for (int i = 0; i < size; i++)
806 bool QuickCheckPossible (string s1, int idx1, int end1,
807 string s2, int idx2, int end2)
809 if (QuickCheckDisabled)
811 // if (s1.Length > 100 || s2.Length > 100)
813 for (int i = idx1; i < end1; i++)
814 if (s1 [i] < 0x20 && (s1 [i] < '\x9' || s1 [i] > '\xD') || s1 [i] >= 0x80 || s1 [i] == '-' || s1 [i] == '\'')
816 for (int i = idx2; i < end2; i++)
817 if (s2 [i] < 0x20 && (s2 [i] < '\x9' || s2 [i] > '\xD') || s2 [i] >= 0x80 || s2 [i] == '-' || s2 [i] == '\'')
822 unsafe int CompareInternal (string s1, int idx1, int len1, string s2,
824 out bool targetConsumed, out bool sourceConsumed,
825 bool skipHeadingExtenders, bool immediateBreakup,
828 COpt opt = ctx.Option;
831 int end1 = idx1 + len1;
832 int end2 = idx2 + len2;
833 targetConsumed = false;
834 sourceConsumed = false;
835 PreviousInfo prev2 = new PreviousInfo (false);
837 if (opt == CompareOptions.None && ctx.QuickCheckPossible)
838 return CompareQuick (s1, idx1, len1, s2, idx2, len2, out sourceConsumed, out targetConsumed, immediateBreakup);
840 // It holds final result that comes from the comparison
841 // at level 2 or lower. Even if Compare() found the
842 // difference at level 2 or lower, it still has to
843 // continue level 1 comparison. FinalResult is used
844 // when there was no further differences.
846 // It holds the comparison level to do. It starts from
847 // 5, and becomes 1 when we do primary-only comparison.
848 int currentLevel = 5;
855 // Skip heading extenders
856 if (skipHeadingExtenders) {
857 for (; idx1 < end1; idx1++)
858 if (GetExtenderType (s1 [idx1]) == ExtenderType.None)
860 for (; idx2 < end2; idx2++)
861 if (GetExtenderType (s2 [idx2]) == ExtenderType.None)
865 ExtenderType ext1 = ExtenderType.None;
866 ExtenderType ext2 = ExtenderType.None;
868 int quickCheckPos1 = idx1;
869 int quickCheckPos2 = idx2;
870 bool stringSort = (opt & COpt.StringSort) != 0;
871 bool ignoreNonSpace = (opt & COpt.IgnoreNonSpace) != 0;
872 Escape escape1 = new Escape ();
873 Escape escape2 = new Escape ();
876 for (; idx1 < end1; idx1++)
877 if (!IsIgnorable (s1 [idx1], opt))
879 for (; idx2 < end2; idx2++)
880 if (!IsIgnorable (s2 [idx2], opt))
884 if (escape1.Source == null)
887 start1 = escape1.Start;
888 idx1 = escape1.Index;
890 quickCheckPos1 = escape1.Optional;
891 escape1.Source = null;
895 if (escape2.Source == null)
898 start2 = escape2.Start;
899 idx2 = escape2.Index;
901 quickCheckPos2 = escape2.Optional;
902 escape2.Source = null;
906 // If comparison is unstable, then this part is one of the most doubtful part.
907 // Here we run quick codepoint comparison and run back to "stable" area to
908 // compare characters.
910 // Strictly to say, even the first character
911 // could be compared here, but it messed
912 // backward step, so I just avoided mess.
913 if (quickCheckPos1 < idx1 && quickCheckPos2 < idx2) {
914 while (idx1 < end1 && idx2 < end2 &&
915 s1 [idx1] == s2 [idx2]) {
919 if (idx1 == end1 || idx2 == end2)
920 continue; // check replacement
922 int backwardEnd1 = quickCheckPos1;
923 int backwardEnd2 = quickCheckPos2;
924 quickCheckPos1 = idx1;
925 quickCheckPos2 = idx2;
930 for (;idx1 > backwardEnd1; idx1--)
931 if (Category (s1 [idx1]) != 1)
933 for (;idx2 > backwardEnd2; idx2--)
934 if (Category (s2 [idx2]) != 1)
936 for (;idx1 > backwardEnd1; idx1--)
937 if (IsSafe (s1 [idx1]))
939 for (;idx2 > backwardEnd2; idx2--)
940 if (IsSafe (s2 [idx2]))
949 int i1 = FilterOptions (s1 [idx1], opt);
950 int i2 = FilterOptions (s2 [idx2], opt);
951 bool special1 = false;
952 bool special2 = false;
954 // If current character is an expander, then
955 // repeat the previous character.
956 ext1 = GetExtenderType (i1);
957 if (ext1 != ExtenderType.None) {
958 if (ctx.PrevCode < 0) {
959 if (ctx.PrevSortKey == null) {
964 sk1 = ctx.PrevSortKey;
967 i1 = FilterExtender (ctx.PrevCode, ext1, opt);
969 ext2 = GetExtenderType (i2);
970 if (ext2 != ExtenderType.None) {
971 if (prev2.Code < 0) {
972 if (prev2.SortKey == null) {
980 i2 = FilterExtender (prev2.Code, ext2, opt);
983 byte cat1 = Category (i1);
984 byte cat2 = Category (i2);
986 // Handle special weight characters
987 if (!stringSort && currentLevel > 4) {
989 lv5At1 = escape1.Source != null ?
990 escape1.Index - escape1.Start :
992 // here Windows has a bug that it does
993 // not consider thirtiary weight.
994 lv5Value1 = Level1 (i1) << 8 + Uni.Level3 (i1);
999 lv5At2 = escape2.Source != null ?
1000 escape2.Index - escape2.Start :
1002 // here Windows has a bug that it does
1003 // not consider thirtiary weight.
1004 lv5Value2 = Level1 (i2) << 8 + Uni.Level3 (i2);
1008 if (cat1 == 6 || cat2 == 6) {
1014 Contraction ct1 = null;
1015 if (ext1 == ExtenderType.None)
1016 ct1 = GetContraction (s1, idx1, end1);
1021 else if (ct1 != null) {
1022 offset1 = ct1.Source.Length;
1023 if (ct1.SortKey != null) {
1025 for (int i = 0; i < ct1.SortKey.Length; i++)
1026 sk1 [i] = ct1.SortKey [i];
1028 ctx.PrevSortKey = sk1;
1030 else if (escape1.Source == null) {
1031 escape1.Source = s1;
1032 escape1.Start = start1;
1033 escape1.Index = cur1 + ct1.Source.Length;
1035 escape1.Optional = quickCheckPos1;
1036 s1 = ct1.Replacement;
1047 sk1 [1] = Level1 (i1);
1048 if (!ignoreNonSpace && currentLevel > 1)
1049 sk1 [2] = Level2 (i1, ext1);
1050 if (currentLevel > 2)
1051 sk1 [3] = Uni.Level3 (i1);
1052 if (currentLevel > 3)
1053 special1 = Uni.HasSpecialWeight ((char) i1);
1058 Contraction ct2 = null;
1059 if (ext2 == ExtenderType.None)
1060 ct2 = GetContraction (s2, idx2, end2);
1064 else if (ct2 != null) {
1065 idx2 += ct2.Source.Length;
1066 if (ct2.SortKey != null) {
1068 for (int i = 0; i < ct2.SortKey.Length; i++)
1069 sk2 [i] = ct2.SortKey [i];
1071 prev2.SortKey = sk2;
1073 else if (escape2.Source == null) {
1074 escape2.Source = s2;
1075 escape2.Start = start2;
1076 escape2.Index = cur2 + ct2.Source.Length;
1078 escape2.Optional = quickCheckPos2;
1079 s2 = ct2.Replacement;
1090 sk2 [1] = Level1 (i2);
1091 if (!ignoreNonSpace && currentLevel > 1)
1092 sk2 [2] = Level2 (i2, ext2);
1093 if (currentLevel > 2)
1094 sk2 [3] = Uni.Level3 (i2);
1095 if (currentLevel > 3)
1096 special2 = Uni.HasSpecialWeight ((char) i2);
1102 // Add offset here so that it does not skip
1103 // idx1 while s2 has a replacement.
1106 // add diacritical marks in s1 here
1107 if (!ignoreNonSpace) {
1108 while (idx1 < end1) {
1109 if (Category (s1 [idx1]) != 1)
1113 sk1 [2] = (byte) (sk1 [2] +
1114 Level2 (s1 [idx1], ExtenderType.None));
1118 // add diacritical marks in s2 here
1119 while (idx2 < end2) {
1120 if (Category (s2 [idx2]) != 1)
1124 sk2 [2] = (byte) (sk2 [2] +
1125 Level2 (s2 [idx2], ExtenderType.None));
1130 int ret = sk1 [0] - sk2 [0];
1131 ret = ret != 0 ? ret : sk1 [1] - sk2 [1];
1134 if (currentLevel == 1)
1136 if (!ignoreNonSpace) {
1137 ret = sk1 [2] - sk2 [2];
1140 if (immediateBreakup)
1141 return -1; // different
1142 currentLevel = frenchSort ? 2 : 1;
1146 if (currentLevel == 2)
1148 ret = sk1 [3] - sk2 [3];
1151 if (immediateBreakup)
1152 return -1; // different
1156 if (currentLevel == 3)
1158 if (special1 != special2) {
1159 if (immediateBreakup)
1160 return -1; // different
1161 finalResult = special1 ? 1 : -1;
1166 ret = CompareFlagPair (
1167 !Uni.IsJapaneseSmallLetter ((char) i1),
1168 !Uni.IsJapaneseSmallLetter ((char) i2));
1169 ret = ret != 0 ? ret :
1170 ToDashTypeValue (ext1, opt) -
1171 ToDashTypeValue (ext2, opt);
1172 ret = ret != 0 ? ret : CompareFlagPair (
1173 Uni.IsHiragana ((char) i1),
1174 Uni.IsHiragana ((char) i2));
1175 ret = ret != 0 ? ret : CompareFlagPair (
1176 !IsHalfKana ((char) i1, opt),
1177 !IsHalfKana ((char) i2, opt));
1179 if (immediateBreakup)
1180 return -1; // different
1188 // If there were only level 3 or lower differences,
1189 // then we still have to find diacritical differences
1191 if (!ignoreNonSpace &&
1192 finalResult != 0 && currentLevel > 2) {
1193 while (idx1 < end1 && idx2 < end2) {
1194 if (!Uni.IsIgnorableNonSpacing (s1 [idx1]))
1196 if (!Uni.IsIgnorableNonSpacing (s2 [idx2]))
1198 finalResult = Level2 (FilterOptions (s1 [idx1], opt), ext1) - Level2 (FilterOptions (s2 [idx2], opt), ext2);
1199 if (finalResult != 0)
1203 // they should work only for the first character
1204 ext1 = ExtenderType.None;
1205 ext2 = ExtenderType.None;
1208 if (currentLevel == 1 && finalResult != 0) {
1209 while (idx1 < end1) {
1210 if (Uni.IsIgnorableNonSpacing (s1 [idx1]))
1215 while (idx2 < end2) {
1216 if (Uni.IsIgnorableNonSpacing (s2 [idx2]))
1222 // we still have to handle level 5
1223 if (finalResult == 0) {
1224 finalResult = lv5At1 - lv5At2;
1225 if (finalResult == 0)
1226 finalResult = lv5Value1 - lv5Value2;
1228 if (finalResult == 0) {
1230 targetConsumed = true;
1232 sourceConsumed = true;
1234 return idx1 != end1 ? 1 : idx2 == end2 ? finalResult : -1;
1237 int CompareFlagPair (bool b1, bool b2)
1239 return b1 == b2 ? 0 : b1 ? 1 : -1;
1244 #region IsPrefix() and IsSuffix()
1246 public bool IsPrefix (string src, string target, CompareOptions opt)
1248 return IsPrefix (src, target, 0, src.Length, opt);
1251 public unsafe bool IsPrefix (string s, string target, int start, int length, CompareOptions opt)
1253 if (target.Length == 0)
1255 byte* sk1 = stackalloc byte [4];
1256 byte* sk2 = stackalloc byte [4];
1257 ClearBuffer (sk1, 4);
1258 ClearBuffer (sk2, 4);
1259 Context ctx = new Context (opt, null, null, sk1, sk2, null,
1260 QuickCheckPossible (s, start, start + length, target, 0, target.Length));
1261 return IsPrefix (s, target, start, length, true, ref ctx);
1264 unsafe bool IsPrefix (string s, string target, int start, int length, bool skipHeadingExtenders, ref Context ctx)
1266 bool consumed, dummy;
1267 CompareInternal (s, start, length,
1268 target, 0, target.Length,
1269 out consumed, out dummy, skipHeadingExtenders,
1276 public bool IsSuffix (string src, string target, CompareOptions opt)
1278 return IsSuffix (src, target, src.Length - 1, src.Length, opt);
1281 public unsafe bool IsSuffix (string s, string target, int start, int length, CompareOptions opt)
1283 if (target.Length == 0)
1285 int last = LastIndexOf (s, target, start, length, opt);
1286 return last >= 0 && Compare (s, last, s.Length - last, target, 0, target.Length, opt) == 0;
1288 // quick check : simple codepoint comparison
1289 if (s.Length >= target.Length) {
1291 int se = start - length;
1292 for (int i = target.Length - 1; si >= se && i >= 0; i--, si--)
1293 if (s [si] != target [i])
1295 if (si == start + target.Length)
1299 PreviousInfo prev = new PreviousInfo (false);
1300 byte* sk1 = stackalloc byte [4];
1301 byte* sk2 = stackalloc byte [4];
1302 ClearBuffer (sk1, 4);
1303 ClearBuffer (sk2, 4);
1304 return IsSuffix (opt, s, target, start, length, ref prev, sk1, sk2);
1308 unsafe bool IsSuffix (string s, string t, int start, int length, ref Context ctx)
1311 COpt opt = ctx.Option;
1313 for (;tstart < t.Length; tstart++)
1314 if (!IsIgnorable (t [tstart], opt))
1316 if (tstart == t.Length)
1317 return true; // as if target is String.Empty.
1320 // This is still not working. If it is not likely to get working, then just remove it.
1322 int send = start - length;
1323 int ti = t.Length - 1;
1326 int sStep = start + 1;
1327 int tStep = t.Length;
1328 bool sourceConsumed, targetConsumed;
1330 for (; send < si; si--)
1331 if (!IsIgnorable (s [si]))
1333 for (; tend < ti; ti--)
1334 if (!IsIgnorable (t [ti]))
1338 for (; send < si; si--)
1339 if (IsSafe (s [si]))
1341 for (; tend < ti; ti--)
1342 if (IsSafe (t [ti]))
1344 Console.WriteLine ("==== {0} {1} {2} {3} {4} {5} {6} {7} {8}", s, si, send, length, t, ti, tstart, sStep - si, tStep - ti);
1345 if (CompareInternal (opt, s, si, sStep - si,
1347 out targetConsumed, out sourceConsumed,
1359 // FIXME: it is not efficient for very long target.
1360 bool sourceConsumed, targetConsumed;
1361 int mismatchCount = 0;
1362 for (int i = 0; i < length; i++) {
1363 ctx.ClearPrevInfo ();
1365 int ret = CompareInternal (s, start - i, i + 1,
1366 t, tstart, t.Length - tstart,
1368 // FIXME: could immediately breakup
1369 out sourceConsumed, true, true, ref ctx);
1372 if (!sourceConsumed && targetConsumed)
1374 if (!targetConsumed) {
1376 if (mismatchCount > Uni.MaxExpansionLength)
1377 // The largest length of an
1378 // expansion is 3, so if the
1379 // target was not consumed more
1380 // than 3 times, then the tail
1381 // character does not match.
1391 #region IndexOf() / LastIndexOf()
1393 public int IndexOf (string s, string target, CompareOptions opt)
1395 return IndexOf (s, target, 0, s.Length, opt);
1398 public unsafe int IndexOf (string s, string target, int start, int length, CompareOptions opt)
1400 byte* alwaysMatchFlags = stackalloc byte [16];
1401 byte* neverMatchFlags = stackalloc byte [16];
1402 byte* targetSortKey = stackalloc byte [4];
1403 byte* sk1 = stackalloc byte [4];
1404 byte* sk2 = stackalloc byte [4];
1405 ClearBuffer (alwaysMatchFlags, 16);
1406 ClearBuffer (neverMatchFlags, 16);
1407 ClearBuffer (targetSortKey, 4);
1408 ClearBuffer (sk1, 4);
1409 ClearBuffer (sk2, 4);
1410 Context ctx = new Context (opt, alwaysMatchFlags, neverMatchFlags, sk1, sk2, null,
1411 QuickCheckPossible (s, start, start + length, target, 0, target.Length));
1413 return IndexOf (s, target, start, length,
1414 targetSortKey, ref ctx);
1417 public int IndexOf (string s, char target, CompareOptions opt)
1419 return IndexOf (s, target, 0, s.Length, opt);
1422 public unsafe int IndexOf (string s, char target, int start, int length, CompareOptions opt)
1424 byte* alwaysMatchFlags = stackalloc byte [16];
1425 byte* neverMatchFlags = stackalloc byte [16];
1426 byte* targetSortKey = stackalloc byte [4];
1427 byte* sk1 = stackalloc byte [4];
1428 byte* sk2 = stackalloc byte [4];
1429 ClearBuffer (alwaysMatchFlags, 16);
1430 ClearBuffer (neverMatchFlags, 16);
1431 ClearBuffer (targetSortKey, 4);
1432 ClearBuffer (sk1, 4);
1433 ClearBuffer (sk2, 4);
1434 Context ctx = new Context (opt, alwaysMatchFlags, neverMatchFlags, sk1, sk2, null, false);
1436 // If target is contraction, then use string search.
1437 Contraction ct = GetContraction (target);
1439 if (ct.Replacement != null)
1440 return IndexOf (s, ct.Replacement,
1441 start, length, targetSortKey, ref ctx);
1443 for (int i = 0; i < ct.SortKey.Length; i++)
1444 sk2 [i] = ct.SortKey [i];
1445 return IndexOfSortKey (s, start, length, sk2, char.MinValue, -1, true, ref ctx);
1448 int ti = FilterOptions ((int) target, opt);
1449 targetSortKey [0] = Category (ti);
1450 targetSortKey [1] = Level1 (ti);
1451 if ((opt & COpt.IgnoreNonSpace) == 0)
1453 Level2 (ti, ExtenderType.None);
1454 targetSortKey [3] = Uni.Level3 (ti);
1455 return IndexOfSortKey (s, start, length,
1456 targetSortKey, target, ti,
1457 !Uni.HasSpecialWeight ((char) ti), ref ctx);
1461 // Searches target byte[] keydata
1462 unsafe int IndexOfSortKey (string s, int start, int length, byte* sortkey, char target, int ti, bool noLv4, ref Context ctx)
1464 int end = start + length;
1469 if (MatchesForward (s, ref idx, end, ti, sortkey, noLv4, ref ctx))
1475 // Searches string. Search head character (or keydata when
1476 // the head is contraction sortkey) and try IsPrefix().
1477 unsafe int IndexOf (string s, string target, int start, int length, byte* targetSortKey, ref Context ctx)
1479 COpt opt = ctx.Option;
1481 for (; tidx < target.Length; tidx++)
1482 if (!IsIgnorable (target [tidx], opt))
1484 if (tidx == target.Length)
1486 Contraction ct = GetContraction (target, tidx, target.Length - tidx);
1487 string replace = ct != null ? ct.Replacement : null;
1488 byte* sk = replace == null ? targetSortKey : null;
1490 char tc = char.MinValue;
1492 if (ct != null && sk != null) {
1493 for (int i = 0; i < ct.SortKey.Length; i++)
1494 sk [i] = ct.SortKey [i];
1495 } else if (sk != null) {
1497 ti = FilterOptions (target [tidx], opt);
1498 sk [0] = Category (ti);
1499 sk [1] = Level1 (ti);
1500 if ((opt & COpt.IgnoreNonSpace) == 0)
1501 sk [2] = Level2 (ti, ExtenderType.None);
1502 sk [3] = Uni.Level3 (ti);
1503 noLv4 = !Uni.HasSpecialWeight ((char) ti);
1506 for (tidx++; tidx < target.Length; tidx++) {
1507 if (Category (target [tidx]) != 1)
1511 sk [2] = (byte) (sk [2] + Level2 (target [tidx], ExtenderType.None));
1517 if (replace != null)
1518 idx = IndexOf (s, replace, start, length, targetSortKey, ref ctx);
1520 idx = IndexOfSortKey (s, start, length, sk, tc, ti, noLv4, ref ctx);
1523 length -= idx - start;
1525 if (IsPrefix (s, target, start, length, false, ref ctx))
1527 Contraction cts = GetContraction (s, start, length);
1529 start += cts.Source.Length;
1530 length -= cts.Source.Length;
1535 } while (length > 0);
1540 // There are the same number of IndexOf() related methods,
1541 // with the same functionalities for each.
1544 public int LastIndexOf (string s, string target, CompareOptions opt)
1546 return LastIndexOf (s, target, s.Length - 1, s.Length, opt);
1549 public unsafe int LastIndexOf (string s, string target, int start, int length, CompareOptions opt)
1551 byte* alwaysMatchFlags = stackalloc byte [16];
1552 byte* neverMatchFlags = stackalloc byte [16];
1553 byte* targetSortKey = stackalloc byte [4];
1554 byte* sk1 = stackalloc byte [4];
1555 byte* sk2 = stackalloc byte [4];
1556 ClearBuffer (alwaysMatchFlags, 16);
1557 ClearBuffer (neverMatchFlags, 16);
1558 ClearBuffer (targetSortKey, 4);
1559 ClearBuffer (sk1, 4);
1560 ClearBuffer (sk2, 4);
1561 // For some unknown reason CompareQuick() does not work fine w/ LastIndexOf().
1562 Context ctx = new Context (opt, alwaysMatchFlags, neverMatchFlags, sk1, sk2, null, false);
1563 return LastIndexOf (s, target, start, length,
1564 targetSortKey, ref ctx);
1567 public int LastIndexOf (string s, char target, CompareOptions opt)
1569 return LastIndexOf (s, target, s.Length - 1, s.Length, opt);
1572 public unsafe int LastIndexOf (string s, char target, int start, int length, CompareOptions opt)
1574 byte* alwaysMatchFlags = stackalloc byte [16];
1575 byte* neverMatchFlags = stackalloc byte [16];
1576 byte* targetSortKey = stackalloc byte [4];
1577 byte* sk1 = stackalloc byte [4];
1578 byte* sk2 = stackalloc byte [4];
1579 ClearBuffer (alwaysMatchFlags, 16);
1580 ClearBuffer (neverMatchFlags, 16);
1581 ClearBuffer (targetSortKey, 4);
1582 ClearBuffer (sk1, 4);
1583 ClearBuffer (sk2, 4);
1584 Context ctx = new Context (opt, alwaysMatchFlags, neverMatchFlags, sk1, sk2, null, false);
1586 // If target is a replacement contraction, then use
1588 Contraction ct = GetContraction (target);
1590 if (ct.Replacement != null)
1591 return LastIndexOf (s,
1592 ct.Replacement, start, length,
1593 targetSortKey, ref ctx);
1595 for (int bi = 0; bi < ct.SortKey.Length; bi++)
1596 sk2 [bi] = ct.SortKey [bi];
1597 return LastIndexOfSortKey (s, start,
1603 int ti = FilterOptions ((int) target, opt);
1604 targetSortKey [0] = Category (ti);
1605 targetSortKey [1] = Level1 (ti);
1606 if ((opt & COpt.IgnoreNonSpace) == 0)
1607 targetSortKey [2] = Level2 (ti, ExtenderType.None);
1608 targetSortKey [3] = Uni.Level3 (ti);
1609 return LastIndexOfSortKey (s, start, start,
1610 length, targetSortKey,
1611 ti, !Uni.HasSpecialWeight ((char) ti),
1616 // Searches target byte[] keydata
1617 unsafe int LastIndexOfSortKey (string s, int start, int orgStart, int length, byte* sortkey, int ti, bool noLv4, ref Context ctx)
1619 int end = start - length;
1623 if (MatchesBackward (s, ref idx, end, orgStart,
1624 ti, sortkey, noLv4, ref ctx))
1630 // Searches string. Search head character (or keydata when
1631 // the head is contraction sortkey) and try IsPrefix().
1632 unsafe int LastIndexOf (string s, string target, int start, int length, byte* targetSortKey, ref Context ctx)
1634 COpt opt = ctx.Option;
1635 int orgStart = start;
1637 for (; tidx < target.Length; tidx++)
1638 if (!IsIgnorable (target [tidx], opt))
1640 if (tidx == target.Length)
1642 Contraction ct = GetContraction (target, tidx, target.Length - tidx);
1643 string replace = ct != null ? ct.Replacement : null;
1644 byte* sk = replace == null ? targetSortKey : null;
1648 if (ct != null && sk != null) {
1649 for (int i = 0; i < ct.SortKey.Length; i++)
1650 sk [i] = ct.SortKey [i];
1651 } else if (sk != null) {
1652 ti = FilterOptions (target [tidx], opt);
1653 sk [0] = Category (ti);
1654 sk [1] = Level1 (ti);
1655 if ((opt & COpt.IgnoreNonSpace) == 0)
1656 sk [2] = Level2 (ti, ExtenderType.None);
1657 sk [3] = Uni.Level3 (ti);
1658 noLv4 = !Uni.HasSpecialWeight ((char) ti);
1661 for (tidx++; tidx < target.Length; tidx++) {
1662 if (Category (target [tidx]) != 1)
1666 sk [2] = (byte) (sk [2] + Level2 (target [tidx], ExtenderType.None));
1673 if (replace != null)
1674 idx = LastIndexOf (s, replace,
1676 targetSortKey, ref ctx);
1678 idx = LastIndexOfSortKey (s, start, orgStart, length, sk, ti, noLv4, ref ctx);
1681 length -= start - idx;
1684 if (IsPrefix (s, target, idx, orgStart - idx + 1, false, ref ctx)) {
1685 for (;idx < orgStart; idx++)
1686 if (!IsIgnorable (s [idx], opt))
1690 Contraction cts = GetContraction (s, idx, orgStart - idx + 1);
1692 start -= cts.Source.Length;
1693 length -= cts.Source.Length;
1698 } while (length > 0);
1702 unsafe bool MatchesForward (string s, ref int idx, int end, int ti, byte* sortkey, bool noLv4, ref Context ctx)
1704 COpt opt = ctx.Option;
1706 if (ctx.AlwaysMatchFlags != null && si < 128 && (ctx.AlwaysMatchFlags [si / 8] & (1 << (si % 8))) != 0)
1708 if (ctx.NeverMatchFlags != null &&
1710 (ctx.NeverMatchFlags [si / 8] & (1 << (si % 8))) != 0) {
1714 ExtenderType ext = GetExtenderType (s [idx]);
1715 Contraction ct = null;
1716 if (MatchesForwardCore (s, ref idx, end, ti, sortkey, noLv4, ext, ref ct, ref ctx)) {
1717 if (ctx.AlwaysMatchFlags != null && ct == null && ext == ExtenderType.None && si < 128)
1718 ctx.AlwaysMatchFlags [si / 8] |= (byte) (1 << (si % 8));
1721 if (ctx.NeverMatchFlags != null && ct == null && ext == ExtenderType.None && si < 128)
1722 ctx.NeverMatchFlags [si / 8] |= (byte) (1 << (si % 8));
1726 unsafe bool MatchesForwardCore (string s, ref int idx, int end, int ti, byte* sortkey, bool noLv4, ExtenderType ext, ref Contraction ct, ref Context ctx)
1728 COpt opt = ctx.Option;
1729 byte* charSortKey = ctx.Buffer1;
1730 bool ignoreNonSpace = (opt & COpt.IgnoreNonSpace) != 0;
1732 if (ext == ExtenderType.None)
1733 ct = GetContraction (s, idx, end);
1734 else if (ctx.PrevCode < 0) {
1735 if (ctx.PrevSortKey == null) {
1739 charSortKey = ctx.PrevSortKey;
1742 si = FilterExtender (ctx.PrevCode, ext, opt);
1743 // if lv4 exists, it never matches contraction
1745 idx += ct.Source.Length;
1748 if (ct.SortKey != null) {
1749 for (int i = 0; i < 4; i++)
1750 charSortKey [i] = sortkey [i];
1752 ctx.PrevSortKey = charSortKey;
1754 // Here is the core of LAMESPEC
1755 // described at the top of the source.
1757 return MatchesForward (ct.Replacement, ref dummy,
1758 ct.Replacement.Length, ti, sortkey, noLv4, ref ctx);
1762 si = FilterOptions (s [idx], opt);
1764 charSortKey [0] = Category (si);
1765 bool noMatch = false;
1766 if (sortkey [0] == charSortKey [0])
1767 charSortKey [1] = Level1 (si);
1770 if (!ignoreNonSpace && sortkey [1] == charSortKey [1])
1771 charSortKey [2] = Level2 (si, ext);
1772 else if (!ignoreNonSpace)
1775 for (; idx < end; idx++) {
1776 if (Category (s [idx]) != 1)
1781 charSortKey [3] = Uni.Level3 (si);
1782 if (charSortKey [0] != 1)
1785 for (; idx < end; idx++) {
1786 if (Category (s [idx]) != 1)
1790 if (charSortKey [2] == 0)
1791 charSortKey [2] = 2;
1792 charSortKey [2] = (byte) (charSortKey [2]
1793 + Level2 (s [idx], ExtenderType.None));
1796 return MatchesPrimitive (opt, charSortKey, si, ext, sortkey, ti, noLv4);
1799 unsafe bool MatchesPrimitive (COpt opt, byte* source, int si, ExtenderType ext, byte* target, int ti, bool noLv4)
1801 bool ignoreNonSpace = (opt & COpt.IgnoreNonSpace) != 0;
1802 if (source [0] != target [0] ||
1803 source [1] != target [1] ||
1804 (!ignoreNonSpace && source [2] != target [2]) ||
1805 source [3] != target [3])
1807 if (noLv4 && (si < 0 || !Uni.HasSpecialWeight ((char) si)))
1811 // Since target can never be an extender, if the source
1812 // is an expander and it matters, then they never match.
1813 if (!ignoreNonSpace && ext == ExtenderType.Conditional)
1815 if (Uni.IsJapaneseSmallLetter ((char) si) !=
1816 Uni.IsJapaneseSmallLetter ((char) ti) ||
1817 ToDashTypeValue (ext, opt) !=
1818 // FIXME: we will have to specify correct value for target
1819 ToDashTypeValue (ExtenderType.None, opt) ||
1820 !Uni.IsHiragana ((char) si) !=
1821 !Uni.IsHiragana ((char) ti) ||
1822 IsHalfKana ((char) si, opt) !=
1823 IsHalfKana ((char) ti, opt))
1828 unsafe bool MatchesBackward (string s, ref int idx, int end, int orgStart, int ti, byte* sortkey, bool noLv4, ref Context ctx)
1831 if (ctx.AlwaysMatchFlags != null && si < 128 && (ctx.AlwaysMatchFlags [si / 8] & (1 << (si % 8))) != 0)
1833 if (ctx.NeverMatchFlags != null && si < 128 && (ctx.NeverMatchFlags [si / 8] & (1 << (si % 8))) != 0) {
1837 ExtenderType ext = GetExtenderType (s [idx]);
1838 Contraction ct = null;
1839 if (MatchesBackwardCore (s, ref idx, end, orgStart, ti, sortkey, noLv4, ext, ref ct, ref ctx)) {
1840 if (ctx.AlwaysMatchFlags != null && ct == null && ext == ExtenderType.None && si < 128)
1841 ctx.AlwaysMatchFlags [si / 8] |= (byte) (1 << (si % 8));
1844 if (ctx.NeverMatchFlags != null && ct == null && ext == ExtenderType.None && si < 128) {
1845 ctx.NeverMatchFlags [si / 8] |= (byte) (1 << (si % 8));
1850 unsafe bool MatchesBackwardCore (string s, ref int idx, int end, int orgStart, int ti, byte* sortkey, bool noLv4, ExtenderType ext, ref Contraction ct, ref Context ctx)
1852 COpt opt = ctx.Option;
1853 byte* charSortKey = ctx.Buffer1;
1854 bool ignoreNonSpace = (opt & COpt.IgnoreNonSpace) != 0;
1857 // To handle extenders in source, we need to
1858 // check next _primary_ character.
1859 if (ext != ExtenderType.None) {
1860 byte diacritical = 0;
1861 for (int tmp = 0; ; tmp--) {
1862 if (tmp < 0) // heading extender
1864 if (IsIgnorable (s [tmp], opt))
1866 int tmpi = FilterOptions (s [tmp], opt);
1867 byte category = Category (tmpi);
1868 if (category == 1) {
1869 diacritical = Level2 (tmpi, ExtenderType.None);
1872 si = FilterExtender (tmpi, ext, opt);
1873 charSortKey [0] = category;
1874 charSortKey [1] = Level1 (si);
1875 if (!ignoreNonSpace)
1876 charSortKey [2] = Level2 (si, ext);
1877 charSortKey [3] = Uni.Level3 (si);
1878 if (ext != ExtenderType.Conditional &&
1881 (charSortKey [2] == 0) ?
1882 (byte) (diacritical + 2) :
1888 if (ext == ExtenderType.None)
1889 ct = GetTailContraction (s, idx, end);
1890 // if lv4 exists, it never matches contraction
1892 idx -= ct.Source.Length;
1895 if (ct.SortKey != null) {
1896 for (int i = 0; i < 4; i++)
1897 charSortKey [i] = sortkey [i];
1899 ctx.PrevSortKey = charSortKey;
1901 // Here is the core of LAMESPEC
1902 // described at the top of the source.
1903 int dummy = ct.Replacement.Length - 1;
1904 return 0 <= LastIndexOfSortKey (
1905 ct.Replacement, dummy, dummy,
1906 ct.Replacement.Length, sortkey,
1907 ti, noLv4, ref ctx);
1909 } else if (ext == ExtenderType.None) {
1911 si = FilterOptions (s [idx], opt);
1913 bool noMatch = false;
1914 charSortKey [0] = Category (si);
1915 if (charSortKey [0] == sortkey [0])
1916 charSortKey [1] = Level1 (si);
1919 if (!ignoreNonSpace && charSortKey [1] == sortkey [1])
1920 charSortKey [2] = Level2 (si, ext);
1921 else if (!ignoreNonSpace)
1925 charSortKey [3] = Uni.Level3 (si);
1926 if (charSortKey [0] != 1)
1929 if (ext == ExtenderType.None) {
1930 for (int tmp = cur + 1; tmp < orgStart; tmp++) {
1931 if (Category (s [tmp]) != 1)
1935 if (charSortKey [2] == 0)
1936 charSortKey [2] = 2;
1937 charSortKey [2] = (byte) (charSortKey [2]
1938 + Level2 (s [tmp], ExtenderType.None));
1941 return MatchesPrimitive (opt, charSortKey, si, ext, sortkey, ti, noLv4);