2 // SimpleCollator.cs : the core collator implementation
5 // Atsushi Enomoto <atsushi@ximian.com>
7 // Copyright (C) 2005 Novell, Inc (http://www.novell.com)
9 // Permission is hereby granted, free of charge, to any person obtaining
10 // a copy of this software and associated documentation files (the
11 // "Software"), to deal in the Software without restriction, including
12 // without limitation the rights to use, copy, modify, merge, publish,
13 // distribute, sublicense, and/or sell copies of the Software, and to
14 // permit persons to whom the Software is furnished to do so, subject to
15 // the following conditions:
17 // The above copyright notice and this permission notice shall be
18 // included in all copies or substantial portions of the Software.
20 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
30 // Here's a summary for supporting contractions, expansions and diacritical
33 // Diacritical mapping is a simple tailoring rule that "remaps" diacritical
34 // weight value from one to another. For now it applies to all range of
35 // characters, but at some stage we might need to limit the remapping ranges.
37 // A Contraction consists of a string (actually char[]) and a sortkey entry
38 // (i.e. byte[]). It indicates that a sequence of characters is interpreted
39 // as a single character that has the mapped sortkey value. There is no
40 // character which goes across "special rules". When the collator encountered
41 // such a sequence of characters, it just returns the sortkey value without
42 // further replacement.
44 // Since it is still likely to happen that a contraction sequence matches
45 // other character than the identical sequence (depending on CompareOptions
46 // and of course, defined sortkey value itself), comparison cannot be done
47 // at source char[] level.
49 // In IndexOf(), the first character in the target string or the target char
50 // itself is turned into sortkey bytes. If the character has a contraction and
51 // that is sortkey map, then it is used instead. If the contraction exists and
52 // that is replacement map, then the first character of the replacement string
53 // is searched instead. IndexOf() always searches only for the top character,
54 // and if it returned negative value, then it returns -1. Otherwise, it then
55 // tries IsPrefix() from that location. If it returns true, then it returns
58 // LAMESPEC: IndexOf() is lame as a whole API. It never matches in the middle
59 // of expansion and there is no proper way to return such indexes within
60 // a single int return value.
62 // For example, try below in .NET:
63 // IndexOf("\u00E6", "a")
64 // IndexOf("\u00E6", "e")
69 using System.Collections;
70 using System.Globalization;
72 using Uni = Mono.Globalization.Unicode.MSCompatUnicodeTable;
73 using UUtil = Mono.Globalization.Unicode.MSCompatUnicodeTableUtil;
74 using COpt = System.Globalization.CompareOptions;
76 namespace Mono.Globalization.Unicode
78 internal class SimpleCollator
80 // this environment variable is for debugging quick check.
81 #pragma warning disable 169, 414
82 static bool QuickCheckDisabled =
83 Environment.internalGetEnvironmentVariable (
84 "MONO_COLLATION_QUICK_CHECK_DISABLED") == "yes";
85 #pragma warning restore 169, 414
87 unsafe internal struct Context
89 public Context (CompareOptions opt, byte* alwaysMatchFlags, byte* neverMatchFlags, byte* buffer1, byte* buffer2, byte* prev1/*, bool quickCheckPossible*/)
92 AlwaysMatchFlags = alwaysMatchFlags;
93 NeverMatchFlags = neverMatchFlags;
98 // QuickCheckPossible = quickCheckPossible;
101 public readonly CompareOptions Option;
102 public readonly byte* NeverMatchFlags;
103 public readonly byte* AlwaysMatchFlags;
104 public byte* Buffer1;
105 public byte* Buffer2;
107 public byte* PrevSortKey;
108 // public readonly bool QuickCheckPossible;
110 public void ClearPrevInfo ()
117 unsafe struct PreviousInfo
120 public byte* SortKey;
122 public PreviousInfo (bool dummy)
131 public string Source;
138 static SimpleCollator invariant =
139 new SimpleCollator (CultureInfo.InvariantCulture);
141 readonly TextInfo textInfo; // for ToLower().
142 readonly CodePointIndexer cjkIndexer;
143 readonly Contraction [] contractions;
144 readonly Level2Map [] level2Maps;
145 // This flag marks characters as "unsafe", where the character
146 // could be used as part of a contraction (whose length > 1).
147 readonly byte [] unsafeFlags;
149 unsafe readonly byte* cjkCatTable;
150 unsafe readonly byte* cjkLv1Table;
151 unsafe readonly byte* cjkLv2Table;
152 readonly CodePointIndexer cjkLv2Indexer;
154 readonly bool frenchSort;
157 const int UnsafeFlagLength = 0x300 / 8;
159 // readonly byte [] contractionFlags = new byte [16];
161 // This array is internally used inside IndexOf() to store
162 // "no need to check" ASCII characters.
164 // Now that it should be thread safe, this array is allocated
166 // byte [] neverMatchFlags = new byte [128 / 8];
168 #region .ctor() and split functions
170 public SimpleCollator (CultureInfo culture)
173 textInfo = culture.TextInfo;
176 SetCJKTable (culture, ref cjkIndexer,
177 ref cjkCatTable, ref cjkLv1Table,
178 ref cjkLv2Indexer, ref cjkLv2Table);
181 // Get tailoring info
182 TailoringInfo t = null;
183 for (CultureInfo ci = culture; ci.LCID != 127; ci = ci.Parent) {
184 t = Uni.GetTailoringInfo (ci.LCID);
188 if (t == null) // then use invariant
189 t = Uni.GetTailoringInfo (127);
191 frenchSort = t.FrenchSort;
192 Uni.BuildTailoringTables (culture, t, ref contractions,
194 unsafeFlags = new byte [UnsafeFlagLength];
195 foreach (Contraction c in contractions) {
196 if (c.Source.Length > 1)
197 foreach (char ch in c.Source)
198 unsafeFlags [(int) ch / 8 ]
199 |= (byte) (1 << ((int) ch & 7));
200 // for (int i = 0; i < c.Source.Length; i++) {
201 // int ch = c.Source [i];
204 // contractionFlags [ch / 8] |= (byte) (1 << (ch & 7));
208 foreach (Contraction c in invariant.contractions) {
209 if (c.Source.Length > 1)
210 foreach (char ch in c.Source)
211 unsafeFlags [(int) ch / 8 ]
212 |= (byte) (1 << ((int) ch & 7));
213 // for (int i = 0; i < c.Source.Length; i++) {
214 // int ch = c.Source [i];
217 // contractionFlags [ch / 8] |= (byte) (1 << (ch & 7));
221 // FIXME: Since tailorings are mostly for latin
222 // (and in some cases Cyrillic) characters, it would
223 // be much better for performance to store "start
224 // indexes" for > 370 (culture-specific letters).
227 // dump tailoring table
228 Console.WriteLine ("******** building table for {0} : contractions - {1} diacritical - {2}",
229 culture.LCID, contractions.Length, level2Maps.Length);
230 foreach (Contraction c in contractions) {
231 foreach (char cc in c.Source)
232 Console.Write ("{0:X4} ", (int) cc);
233 Console.WriteLine (" -> '{0}'", c.Replacement);
238 unsafe private void SetCJKTable (
239 CultureInfo culture, ref CodePointIndexer cjkIndexer,
240 ref byte* catTable, ref byte* lv1Table,
241 ref CodePointIndexer lv2Indexer, ref byte* lv2Table)
243 string name = GetNeutralCulture (culture).Name;
245 Uni.FillCJK (name, ref cjkIndexer, ref catTable,
246 ref lv1Table, ref lv2Indexer, ref lv2Table);
249 static CultureInfo GetNeutralCulture (CultureInfo info)
251 CultureInfo ret = info;
252 while (ret.Parent != null && ret.Parent.LCID != 127)
259 unsafe byte Category (int cp)
261 if (cp < 0x3000 || cjkCatTable == null)
262 return Uni.Category (cp);
263 int idx = cjkIndexer.ToIndex (cp);
264 return idx < 0 ? Uni.Category (cp) : cjkCatTable [idx];
267 unsafe byte Level1 (int cp)
269 if (cp < 0x3000 || cjkLv1Table == null)
270 return Uni.Level1 (cp);
271 int idx = cjkIndexer.ToIndex (cp);
272 return idx < 0 ? Uni.Level1 (cp) : cjkLv1Table [idx];
275 unsafe byte Level2 (int cp, ExtenderType ext)
277 if (ext == ExtenderType.Buggy)
279 else if (ext == ExtenderType.Conditional)
282 if (cp < 0x3000 || cjkLv2Table == null)
283 return Uni.Level2 (cp);
284 int idx = cjkLv2Indexer.ToIndex (cp);
285 byte ret = idx < 0 ? (byte) 0 : cjkLv2Table [idx];
288 ret = Uni.Level2 (cp);
289 if (level2Maps.Length == 0)
291 for (int i = 0; i < level2Maps.Length; i++) {
292 if (level2Maps [i].Source == ret)
293 return level2Maps [i].Replace;
294 else if (level2Maps [i].Source > ret)
300 static bool IsHalfKana (int cp, COpt opt)
302 return (opt & COpt.IgnoreWidth) != 0 ||
303 Uni.IsHalfWidthKana ((char) cp);
306 Contraction GetContraction (string s, int start, int end)
308 // int si = s [start];
309 // if (si < 128 && (contractionFlags [si / 8] & (1 << (si & 7))) == 0)
311 Contraction c = GetContraction (s, start, end, contractions);
312 if (c != null || lcid == 127)
314 return GetContraction (s, start, end, invariant.contractions);
317 Contraction GetContraction (string s, int start, int end, Contraction [] clist)
319 for (int i = 0; i < clist.Length; i++) {
320 Contraction ct = clist [i];
321 int diff = ct.Source [0] - s [start];
323 return null; // it's already sorted
326 char [] chars = ct.Source;
327 if (end - start < chars.Length)
330 for (int n = 0; n < chars.Length; n++)
331 if (s [start + n] != chars [n]) {
341 Contraction GetTailContraction (string s, int start, int end)
343 // int si = s [end - 1];
344 // if (si < 128 && (contractionFlags [si / 8] & (1 << (si & 7))) == 0)
346 Contraction c = GetTailContraction (s, start, end, contractions);
347 if (c != null || lcid == 127)
349 return GetTailContraction (s, start, end, invariant.contractions);
352 Contraction GetTailContraction (string s, int start, int end, Contraction [] clist)
354 if (start == end || end < -1 || start >= s.Length || s.Length <= end + 1)
355 throw new SystemException (String.Format ("MONO internal error. Failed to get TailContraction. start = {0} end = {1} string = '{2}'", start, end, s));
356 for (int i = 0; i < clist.Length; i++) {
357 Contraction ct = clist [i];
358 char [] chars = ct.Source;
359 if (chars.Length > start - end)
361 if (chars [chars.Length - 1] != s [start])
365 for (int n = 0, spos = start - chars.Length + 1;
368 if (s [spos] != chars [n]) {
379 Contraction GetContraction (char c)
381 Contraction ct = GetContraction (c, contractions);
382 if (ct != null || lcid == 127)
384 return GetContraction (c, invariant.contractions);
387 Contraction GetContraction (char c, Contraction [] clist)
389 for (int i = 0; i < clist.Length; i++) {
390 Contraction ct = clist [i];
391 if (ct.Source [0] > c)
392 return null; // it's already sorted
393 if (ct.Source [0] == c && ct.Source.Length == 1)
399 int FilterOptions (int i, COpt opt)
401 if ((opt & COpt.IgnoreWidth) != 0) {
402 int x = Uni.ToWidthCompat (i);
406 if ((opt & COpt.OrdinalIgnoreCase) != 0)
407 i = textInfo.ToLower ((char) i);
408 if ((opt & COpt.IgnoreCase) != 0)
409 i = textInfo.ToLower ((char) i);
410 if ((opt & COpt.IgnoreKanaType) != 0)
411 i = Uni.ToKanaTypeInsensitive (i);
423 ExtenderType GetExtenderType (int i)
425 // LAMESPEC: Windows expects true for U+3005, but
426 // sometimes it does not represent to repeat just
428 // Windows also expects true for U+3031 and U+3032,
429 // but they should *never* repeat one character.
431 // U+2015 becomes an extender only when it is Japanese
433 return lcid == 16 ? ExtenderType.Conditional :
436 if (i < 0x3005 || i > 0xFF70)
437 return ExtenderType.None;
442 return ExtenderType.Simple;
444 return ExtenderType.Conditional;
447 return ExtenderType.Voiced;
451 return ExtenderType.None;
453 case 0x3005: // LAMESPEC: see above.
454 return ExtenderType.Buggy;
455 case 0x3031: // LAMESPEC: see above.
456 case 0x3032: // LAMESPEC: see above.
459 return ExtenderType.Simple;
462 return ExtenderType.Voiced;
464 return ExtenderType.Conditional;
466 return ExtenderType.None;
470 static byte ToDashTypeValue (ExtenderType ext, COpt opt)
472 if ((opt & COpt.IgnoreNonSpace) != 0) // LAMESPEC: huh, why?
475 case ExtenderType.None:
477 case ExtenderType.Conditional:
484 int FilterExtender (int i, ExtenderType ext, COpt opt)
486 if (ext == ExtenderType.Conditional &&
487 Uni.HasSpecialWeight ((char) i)) {
488 bool half = IsHalfKana ((char) i, opt);
489 bool katakana = !Uni.IsHiragana ((char) i);
490 switch (Level1 (i) & 7) {
492 return half ? 0xFF71 : katakana ? 0x30A2 : 0x3042;
494 return half ? 0xFF72 : katakana ? 0x30A4 : 0x3044;
496 return half ? 0xFF73 : katakana ? 0x30A6 : 0x3046;
498 return half ? 0xFF74 : katakana ? 0x30A8 : 0x3048;
500 return half ? 0xFF75 : katakana ? 0x30AA : 0x304A;
506 static bool IsIgnorable (int i, COpt opt)
508 return Uni.IsIgnorable (i, (byte) (1 +
509 ((opt & COpt.IgnoreSymbols) != 0 ? 2 : 0) +
510 ((opt & COpt.IgnoreNonSpace) != 0 ? 4 : 0)));
516 return i / 8 >= unsafeFlags.Length ? true : (unsafeFlags [i / 8] & (1 << (i % 8))) == 0;
521 public SortKey GetSortKey (string s)
523 return GetSortKey (s, CompareOptions.None);
526 public SortKey GetSortKey (string s, CompareOptions options)
528 return GetSortKey (s, 0, s.Length, options);
531 public SortKey GetSortKey (string s, int start, int length, CompareOptions options)
533 SortKeyBuffer buf = new SortKeyBuffer (lcid);
534 buf.Initialize (options, lcid, s, frenchSort);
535 int end = start + length;
536 GetSortKey (s, start, end, buf, options);
537 return buf.GetResultAndReset ();
540 unsafe void GetSortKey (string s, int start, int end,
541 SortKeyBuffer buf, CompareOptions opt)
543 byte* prevbuf = stackalloc byte [4];
544 ClearBuffer (prevbuf, 4);
545 Context ctx = new Context (opt, null, null, null, null, prevbuf);
547 for (int n = start; n < end; n++) {
550 ExtenderType ext = GetExtenderType (i);
551 if (ext != ExtenderType.None) {
552 i = FilterExtender (ctx.PrevCode, ext, opt);
554 FillSortKeyRaw (i, ext, buf, opt);
555 else if (ctx.PrevSortKey != null) {
556 byte* b = ctx.PrevSortKey;
560 b [2] != 1 ? b [2] : Level2 (i, ext),
561 b [3] != 1 ? b [3] : Uni.Level3 (i));
563 // otherwise do nothing.
564 // (if the extender is the first char
565 // in the string, then just ignore.)
569 if (IsIgnorable (i, opt))
571 i = FilterOptions (i, opt);
573 Contraction ct = GetContraction (s, n, end);
575 if (ct.Replacement != null) {
576 GetSortKey (ct.Replacement, 0, ct.Replacement.Length, buf, opt);
578 byte* b = ctx.PrevSortKey;
579 for (int bi = 0; bi < ct.SortKey.Length; bi++)
580 b [bi] = ct.SortKey [bi];
584 b [2] != 1 ? b [2] : Level2 (i, ext),
585 b [3] != 1 ? b [3] : Uni.Level3 (i));
588 n += ct.Source.Length - 1;
591 if (!Uni.IsIgnorableNonSpacing (i))
593 FillSortKeyRaw (i, ExtenderType.None, buf, opt);
598 void FillSortKeyRaw (int i, ExtenderType ext,
599 SortKeyBuffer buf, CompareOptions opt)
601 if (0x3400 <= i && i <= 0x4DB5) {
602 int diff = i - 0x3400;
603 buf.AppendCJKExtension (
604 (byte) (0x10 + diff / 254),
605 (byte) (diff % 254 + 2));
609 UnicodeCategory uc = char.GetUnicodeCategory ((char) i);
611 case UnicodeCategory.PrivateUse:
612 int diff = i - 0xE000;
614 (byte) (0xE5 + diff / 254),
615 (byte) (diff % 254 + 2),
619 case UnicodeCategory.Surrogate:
620 FillSurrogateSortKeyRaw (i, buf);
624 byte level2 = Level2 (i, ext);
625 if (Uni.HasSpecialWeight ((char) i)) {
626 byte level1 = Level1 (i);
632 Uni.IsJapaneseSmallLetter ((char) i),
633 ToDashTypeValue (ext, opt),
634 !Uni.IsHiragana ((char) i),
635 IsHalfKana ((char) i, opt)
637 if ((opt & COpt.IgnoreNonSpace) == 0 && ext == ExtenderType.Voiced)
638 // Append voice weight
639 buf.AppendNormal (1, 1, 1, 0);
649 void FillSurrogateSortKeyRaw (int i, SortKeyBuffer buf)
658 lower = (byte) ((i == 0xD800) ? 0x3E : 0x3F);
659 } else if (0xD840 <= i && i < 0xD880) {
663 } else if (0xDB80 <= i && i < 0xDC00) {
664 diffbase = 0xDB80 - 0x40;
668 diffbase = 0xDC00 - 0xF8 + 2;
672 int diff = i - diffbase;
675 (byte) (segment + diff / 254),
676 (byte) (diff % 254 + 2),
685 public int Compare (string s1, string s2)
687 return Compare (s1, 0, s1.Length, s2, 0, s2.Length, CompareOptions.None);
690 private int CompareOrdinal (string s1, int idx1, int len1,
691 string s2, int idx2, int len2)
693 int min = len1 < len2 ? len1 : len2;
694 int end1 = idx1 + min;
695 int end2 = idx2 + min;
696 if (idx1 < 0 || idx2 < 0 || end1 > s1.Length || end2 > s2.Length)
697 throw new SystemException (String.Format ("CompareInfo Internal Error: Should not happen. {0} {1} {2} {3} {4} {5}", idx1, idx2, len1, len2, s1.Length, s2.Length));
698 for (int i1 = idx1, i2 = idx2;
699 i1 < end1 && i2 < end2; i1++, i2++)
700 if (s1 [i1] != s2 [i2])
701 return s1 [i1] - s2 [i2];
702 return len1 == len2 ? 0 :
703 len1 == min ? - 1 : 1;
706 // mostly equivalent to CompareOrdinal, but the return value is
707 // not based on codepoints.
708 private int CompareQuick (string s1, int idx1, int len1,
709 string s2, int idx2, int len2, out bool sourceConsumed,
710 out bool targetConsumed, bool immediateBreakup)
712 sourceConsumed = false;
713 targetConsumed = false;
714 int min = len1 < len2 ? len1 : len2;
715 int end1 = idx1 + min;
716 int end2 = idx2 + min;
717 if (idx1 < 0 || idx2 < 0 || end1 > s1.Length || end2 > s2.Length)
718 throw new SystemException (String.Format ("CompareInfo Internal Error: Should not happen. {0} {1} {2} {3} {4} {5}", idx1, idx2, len1, len2, s1.Length, s2.Length));
719 for (int i1 = idx1, i2 = idx2;
720 i1 < end1 && i2 < end2; i1++, i2++)
721 if (s1 [i1] != s2 [i2]) {
722 if (immediateBreakup)
724 int ret = Category (s1 [i1]) - Category (s2 [i2]);
726 ret = Level1 (s1 [i1]) - Level1 (s2 [i2]);
729 ret = Uni.Level3 (s1 [i1]) - Uni.Level3 (s2 [i2]);
731 throw new SystemException (String.Format ("CompareInfo Internal Error: Should not happen. '{0}' {2} {3} '{1}' {4} {5}", s1, s2, idx1, end1, idx2, end2));
734 sourceConsumed = len1 <= len2;
735 targetConsumed = len1 >= len2;
736 return len1 == len2 ? 0 :
737 len1 == min ? - 1 : 1;
740 private int CompareOrdinalIgnoreCase (string s1, int idx1, int len1,
741 string s2, int idx2, int len2)
743 int min = len1 < len2 ? len1 : len2;
744 int end1 = idx1 + min;
745 int end2 = idx2 + min;
746 if (idx1 < 0 || idx2 < 0 || end1 > s1.Length || end2 > s2.Length)
747 throw new SystemException (String.Format ("CompareInfo Internal Error: Should not happen. {0} {1} {2} {3} {4} {5}", idx1, idx2, len1, len2, s1.Length, s2.Length));
748 TextInfo ti = invariant.textInfo;
749 for (int i1 = idx1, i2 = idx2;
750 i1 < end1 && i2 < end2; i1++, i2++)
751 if (ti.ToLower (s1 [i1]) != ti.ToLower (s2 [i2]))
752 return ti.ToLower (s1 [i1]) - ti.ToLower (s2 [i2]);
753 return len1 == len2 ? 0 :
754 len1 == min ? - 1 : 1;
757 internal unsafe int Compare (string s1, int idx1, int len1,
758 string s2, int idx2, int len2, CompareOptions options)
760 #if false // stable easy version, depends on GetSortKey().
761 SortKey sk1 = GetSortKey (s1, idx1, len1, options);
762 SortKey sk2 = GetSortKey (s2, idx2, len2, options);
763 byte [] d1 = sk1.KeyData;
764 byte [] d2 = sk2.KeyData;
765 int len = d1.Length > d2.Length ? d2.Length : d1.Length;
766 for (int i = 0; i < len; i++)
767 if (d1 [i] != d2 [i])
768 return d1 [i] < d2 [i] ? -1 : 1;
769 return d1.Length == d2.Length ? 0 : d1.Length < d2.Length ? -1 : 1;
771 byte* sk1 = stackalloc byte [4];
772 byte* sk2 = stackalloc byte [4];
773 ClearBuffer (sk1, 4);
774 ClearBuffer (sk2, 4);
775 Context ctx = new Context (options, null, null, sk1, sk2, null);
776 // QuickCheckPossible (s1, idx1, idx1 + len1, s2, idx2, idx2 + len2));
779 int ret = CompareInternal (s1, idx1, len1, s2, idx2, len2, out dummy, out dummy2, true, false, ref ctx);
780 return ret == 0 ? 0 : ret < 0 ? -1 : 1;
784 unsafe void ClearBuffer (byte* buffer, int size)
786 for (int i = 0; i < size; i++)
791 bool QuickCheckPossible (string s1, int idx1, int end1,
792 string s2, int idx2, int end2)
795 // looks like it is buggy and inefficient, so just disable it.
798 if (QuickCheckDisabled)
800 // if (s1.Length > 100 || s2.Length > 100)
802 for (int i = idx1; i < end1; i++)
803 if (s1 [i] < 0x20 && (s1 [i] < '\x9' || s1 [i] > '\xD') || s1 [i] >= 0x80 || s1 [i] == '-' || s1 [i] == '\'')
805 for (int i = idx2; i < end2; i++)
806 if (s2 [i] < 0x20 && (s2 [i] < '\x9' || s2 [i] > '\xD') || s2 [i] >= 0x80 || s2 [i] == '-' || s2 [i] == '\'')
813 unsafe int CompareInternal (string s1, int idx1, int len1, string s2,
815 out bool targetConsumed, out bool sourceConsumed,
816 bool skipHeadingExtenders, bool immediateBreakup,
819 COpt opt = ctx.Option;
822 int end1 = idx1 + len1;
823 int end2 = idx2 + len2;
824 targetConsumed = false;
825 sourceConsumed = false;
826 PreviousInfo prev2 = new PreviousInfo (false);
828 // if (opt == CompareOptions.None && ctx.QuickCheckPossible)
829 // return CompareQuick (s1, idx1, len1, s2, idx2, len2, out sourceConsumed, out targetConsumed, immediateBreakup);
831 // It holds final result that comes from the comparison
832 // at level 2 or lower. Even if Compare() found the
833 // difference at level 2 or lower, it still has to
834 // continue level 1 comparison. FinalResult is used
835 // when there was no further differences.
837 // It holds the comparison level to do. It starts from
838 // 5, and becomes 1 when we do primary-only comparison.
839 int currentLevel = 5;
846 // Skip heading extenders
847 if (skipHeadingExtenders) {
848 for (; idx1 < end1; idx1++)
849 if (GetExtenderType (s1 [idx1]) == ExtenderType.None)
851 for (; idx2 < end2; idx2++)
852 if (GetExtenderType (s2 [idx2]) == ExtenderType.None)
856 ExtenderType ext1 = ExtenderType.None;
857 ExtenderType ext2 = ExtenderType.None;
859 int quickCheckPos1 = idx1;
860 int quickCheckPos2 = idx2;
861 bool stringSort = (opt & COpt.StringSort) != 0;
862 bool ignoreNonSpace = (opt & COpt.IgnoreNonSpace) != 0;
863 Escape escape1 = new Escape ();
864 Escape escape2 = new Escape ();
867 for (; idx1 < end1; idx1++)
868 if (!IsIgnorable (s1 [idx1], opt))
870 for (; idx2 < end2; idx2++)
871 if (!IsIgnorable (s2 [idx2], opt))
875 if (escape1.Source == null)
878 start1 = escape1.Start;
879 idx1 = escape1.Index;
881 quickCheckPos1 = escape1.Optional;
882 escape1.Source = null;
886 if (escape2.Source == null)
889 start2 = escape2.Start;
890 idx2 = escape2.Index;
892 quickCheckPos2 = escape2.Optional;
893 escape2.Source = null;
897 // If comparison is unstable, then this part is one of the most doubtful part.
898 // Here we run quick codepoint comparison and run back to "stable" area to
899 // compare characters.
901 // Strictly to say, even the first character
902 // could be compared here, but it messed
903 // backward step, so I just avoided mess.
904 if (quickCheckPos1 < idx1 && quickCheckPos2 < idx2) {
905 while (idx1 < end1 && idx2 < end2 &&
906 s1 [idx1] == s2 [idx2]) {
910 if (idx1 == end1 || idx2 == end2)
911 continue; // check replacement
913 int backwardEnd1 = quickCheckPos1;
914 int backwardEnd2 = quickCheckPos2;
915 quickCheckPos1 = idx1;
916 quickCheckPos2 = idx2;
921 for (;idx1 > backwardEnd1; idx1--)
922 if (Category (s1 [idx1]) != 1)
924 for (;idx2 > backwardEnd2; idx2--)
925 if (Category (s2 [idx2]) != 1)
927 for (;idx1 > backwardEnd1; idx1--)
928 if (IsSafe (s1 [idx1]))
930 for (;idx2 > backwardEnd2; idx2--)
931 if (IsSafe (s2 [idx2]))
940 int i1 = FilterOptions (s1 [idx1], opt);
941 int i2 = FilterOptions (s2 [idx2], opt);
942 bool special1 = false;
943 bool special2 = false;
945 // If current character is an expander, then
946 // repeat the previous character.
947 ext1 = GetExtenderType (i1);
948 if (ext1 != ExtenderType.None) {
949 if (ctx.PrevCode < 0) {
950 if (ctx.PrevSortKey == null) {
955 sk1 = ctx.PrevSortKey;
958 i1 = FilterExtender (ctx.PrevCode, ext1, opt);
960 ext2 = GetExtenderType (i2);
961 if (ext2 != ExtenderType.None) {
962 if (prev2.Code < 0) {
963 if (prev2.SortKey == null) {
971 i2 = FilterExtender (prev2.Code, ext2, opt);
974 byte cat1 = Category (i1);
975 byte cat2 = Category (i2);
977 // Handle special weight characters
979 if (!stringSort && currentLevel == 5) {
980 lv5At1 = escape1.Source != null ?
981 escape1.Index - escape1.Start :
983 // here Windows has a bug that it does
984 // not consider thirtiary weight.
985 lv5Value1 = Level1 (i1) << 8 + Uni.Level3 (i1);
991 if (!stringSort && currentLevel == 5) {
992 lv5At2 = escape2.Source != null ?
993 escape2.Index - escape2.Start :
995 // here Windows has a bug that it does
996 // not consider thirtiary weight.
997 lv5Value2 = Level1 (i2) << 8 + Uni.Level3 (i2);
1002 if (cat1 == 6 || cat2 == 6) {
1003 if (currentLevel == 5) {
1004 if (lv5Value1 == lv5Value2) {
1005 // There is not really difference here.
1006 lv5At1 = lv5At2 = -1;
1007 lv5Value1 = lv5Value2 = 0;
1015 Contraction ct1 = null;
1016 if (ext1 == ExtenderType.None)
1017 ct1 = GetContraction (s1, idx1, end1);
1022 else if (ct1 != null) {
1023 offset1 = ct1.Source.Length;
1024 if (ct1.SortKey != null) {
1026 for (int i = 0; i < ct1.SortKey.Length; i++)
1027 sk1 [i] = ct1.SortKey [i];
1029 ctx.PrevSortKey = sk1;
1031 else if (escape1.Source == null) {
1032 escape1.Source = s1;
1033 escape1.Start = start1;
1034 escape1.Index = cur1 + ct1.Source.Length;
1036 escape1.Optional = quickCheckPos1;
1037 s1 = ct1.Replacement;
1048 sk1 [1] = Level1 (i1);
1049 if (!ignoreNonSpace && currentLevel > 1)
1050 sk1 [2] = Level2 (i1, ext1);
1051 if (currentLevel > 2)
1052 sk1 [3] = Uni.Level3 (i1);
1053 if (currentLevel > 3)
1054 special1 = Uni.HasSpecialWeight ((char) i1);
1059 Contraction ct2 = null;
1060 if (ext2 == ExtenderType.None)
1061 ct2 = GetContraction (s2, idx2, end2);
1065 else if (ct2 != null) {
1066 idx2 += ct2.Source.Length;
1067 if (ct2.SortKey != null) {
1069 for (int i = 0; i < ct2.SortKey.Length; i++)
1070 sk2 [i] = ct2.SortKey [i];
1072 prev2.SortKey = sk2;
1074 else if (escape2.Source == null) {
1075 escape2.Source = s2;
1076 escape2.Start = start2;
1077 escape2.Index = cur2 + ct2.Source.Length;
1079 escape2.Optional = quickCheckPos2;
1080 s2 = ct2.Replacement;
1091 sk2 [1] = Level1 (i2);
1092 if (!ignoreNonSpace && currentLevel > 1)
1093 sk2 [2] = Level2 (i2, ext2);
1094 if (currentLevel > 2)
1095 sk2 [3] = Uni.Level3 (i2);
1096 if (currentLevel > 3)
1097 special2 = Uni.HasSpecialWeight ((char) i2);
1103 // Add offset here so that it does not skip
1104 // idx1 while s2 has a replacement.
1107 if (!ignoreNonSpace) {
1108 // add diacritical marks in s1 here
1109 while (idx1 < end1) {
1110 if (Category (s1 [idx1]) != 1)
1114 sk1 [2] = (byte) (sk1 [2] +
1115 Level2 (s1 [idx1], ExtenderType.None));
1119 // add diacritical marks in s2 here
1120 while (idx2 < end2) {
1121 if (Category (s2 [idx2]) != 1)
1125 sk2 [2] = (byte) (sk2 [2] +
1126 Level2 (s2 [idx2], ExtenderType.None));
1131 int ret = sk1 [0] - sk2 [0];
1132 ret = ret != 0 ? ret : sk1 [1] - sk2 [1];
1135 if (currentLevel == 1)
1137 if (!ignoreNonSpace) {
1138 ret = sk1 [2] - sk2 [2];
1141 if (immediateBreakup)
1142 return -1; // different
1143 currentLevel = frenchSort ? 2 : 1;
1147 if (currentLevel == 2)
1149 ret = sk1 [3] - sk2 [3];
1152 if (immediateBreakup)
1153 return -1; // different
1157 if (currentLevel == 3)
1159 if (special1 != special2) {
1160 if (immediateBreakup)
1161 return -1; // different
1162 finalResult = special1 ? 1 : -1;
1167 ret = CompareFlagPair (
1168 !Uni.IsJapaneseSmallLetter ((char) i1),
1169 !Uni.IsJapaneseSmallLetter ((char) i2));
1170 ret = ret != 0 ? ret :
1171 ToDashTypeValue (ext1, opt) -
1172 ToDashTypeValue (ext2, opt);
1173 ret = ret != 0 ? ret : CompareFlagPair (
1174 Uni.IsHiragana ((char) i1),
1175 Uni.IsHiragana ((char) i2));
1176 ret = ret != 0 ? ret : CompareFlagPair (
1177 !IsHalfKana ((char) i1, opt),
1178 !IsHalfKana ((char) i2, opt));
1180 if (immediateBreakup)
1181 return -1; // different
1189 // If there were only level 3 or lower differences,
1190 // then we still have to find diacritical differences
1192 if (!ignoreNonSpace &&
1193 finalResult != 0 && currentLevel > 2) {
1194 while (idx1 < end1 && idx2 < end2) {
1195 if (!Uni.IsIgnorableNonSpacing (s1 [idx1]))
1197 if (!Uni.IsIgnorableNonSpacing (s2 [idx2]))
1199 finalResult = Level2 (FilterOptions (s1 [idx1], opt), ext1) - Level2 (FilterOptions (s2 [idx2], opt), ext2);
1200 if (finalResult != 0)
1204 // they should work only for the first character
1205 ext1 = ExtenderType.None;
1206 ext2 = ExtenderType.None;
1209 if (currentLevel == 1 && finalResult != 0) {
1210 while (idx1 < end1) {
1211 if (Uni.IsIgnorableNonSpacing (s1 [idx1]))
1216 while (idx2 < end2) {
1217 if (Uni.IsIgnorableNonSpacing (s2 [idx2]))
1223 // we still have to check level 5
1224 if (finalResult == 0) {
1225 if (lv5At1 < 0 && lv5At2 >= 0)
1227 else if (lv5At2 < 0 && lv5At1 >= 0)
1230 finalResult = lv5At1 - lv5At2;
1231 if (finalResult == 0)
1232 finalResult = lv5Value1 - lv5Value2;
1235 if (finalResult == 0) {
1237 targetConsumed = true;
1239 sourceConsumed = true;
1241 return idx1 != end1 ? 1 : idx2 == end2 ? finalResult : -1;
1244 int CompareFlagPair (bool b1, bool b2)
1246 return b1 == b2 ? 0 : b1 ? 1 : -1;
1251 #region IsPrefix() and IsSuffix()
1253 public bool IsPrefix (string src, string target, CompareOptions opt)
1255 return IsPrefix (src, target, 0, src.Length, opt);
1258 public unsafe bool IsPrefix (string s, string target, int start, int length, CompareOptions opt)
1260 if (target.Length == 0)
1262 byte* sk1 = stackalloc byte [4];
1263 byte* sk2 = stackalloc byte [4];
1264 ClearBuffer (sk1, 4);
1265 ClearBuffer (sk2, 4);
1266 Context ctx = new Context (opt, null, null, sk1, sk2, null);
1267 //QuickCheckPossible (s, start, start + length, target, 0, target.Length));
1268 return IsPrefix (s, target, start, length, true, ref ctx);
1271 unsafe bool IsPrefix (string s, string target, int start, int length, bool skipHeadingExtenders, ref Context ctx)
1273 bool consumed, dummy;
1274 CompareInternal (s, start, length,
1275 target, 0, target.Length,
1276 out consumed, out dummy, skipHeadingExtenders,
1283 public bool IsSuffix (string src, string target, CompareOptions opt)
1285 return IsSuffix (src, target, src.Length - 1, src.Length, opt);
1288 public unsafe bool IsSuffix (string s, string target, int start, int length, CompareOptions opt)
1290 if (target.Length == 0)
1292 int last = LastIndexOf (s, target, start, length, opt);
1293 return last >= 0 && Compare (s, last, s.Length - last, target, 0, target.Length, opt) == 0;
1295 // quick check : simple codepoint comparison
1296 if (s.Length >= target.Length) {
1298 int se = start - length;
1299 for (int i = target.Length - 1; si >= se && i >= 0; i--, si--)
1300 if (s [si] != target [i])
1302 if (si == start + target.Length)
1306 PreviousInfo prev = new PreviousInfo (false);
1307 byte* sk1 = stackalloc byte [4];
1308 byte* sk2 = stackalloc byte [4];
1309 ClearBuffer (sk1, 4);
1310 ClearBuffer (sk2, 4);
1311 return IsSuffix (opt, s, target, start, length, ref prev, sk1, sk2);
1315 unsafe bool IsSuffix (string s, string t, int start, int length, ref Context ctx)
1318 COpt opt = ctx.Option;
1320 for (;tstart < t.Length; tstart++)
1321 if (!IsIgnorable (t [tstart], opt))
1323 if (tstart == t.Length)
1324 return true; // as if target is String.Empty.
1327 // This is still not working. If it is not likely to get working, then just remove it.
1329 int send = start - length;
1330 int ti = t.Length - 1;
1333 int sStep = start + 1;
1334 int tStep = t.Length;
1335 bool sourceConsumed, targetConsumed;
1337 for (; send < si; si--)
1338 if (!IsIgnorable (s [si]))
1340 for (; tend < ti; ti--)
1341 if (!IsIgnorable (t [ti]))
1345 for (; send < si; si--)
1346 if (IsSafe (s [si]))
1348 for (; tend < ti; ti--)
1349 if (IsSafe (t [ti]))
1351 Console.WriteLine ("==== {0} {1} {2} {3} {4} {5} {6} {7} {8}", s, si, send, length, t, ti, tstart, sStep - si, tStep - ti);
1352 if (CompareInternal (opt, s, si, sStep - si,
1354 out targetConsumed, out sourceConsumed,
1366 // FIXME: it is not efficient for very long target.
1367 bool sourceConsumed, targetConsumed;
1368 int mismatchCount = 0;
1369 for (int i = 0; i < length; i++) {
1370 ctx.ClearPrevInfo ();
1372 int ret = CompareInternal (s, start - i, i + 1,
1373 t, tstart, t.Length - tstart,
1375 // FIXME: could immediately breakup
1376 out sourceConsumed, true, true, ref ctx);
1379 if (!sourceConsumed && targetConsumed)
1381 if (!targetConsumed) {
1383 if (mismatchCount > Uni.MaxExpansionLength)
1384 // The largest length of an
1385 // expansion is 3, so if the
1386 // target was not consumed more
1387 // than 3 times, then the tail
1388 // character does not match.
1398 #region IndexOf() / LastIndexOf()
1402 public int IndexOf (string s, string target, CompareOptions opt)
1404 return IndexOf (s, target, 0, s.Length, opt);
1407 int QuickIndexOf (string s, string target, int start, int length, out bool testWasUnable)
1409 int testedSourcePos = -1, testedTargetPos = -1;
1411 testWasUnable = true;
1412 if (target.Length == 0)
1414 else if (target.Length > length)
1416 testWasUnable = false;
1418 int end = start + length - target.Length + 1;
1419 for (int i = start; i < end; i++) {
1421 for (int j = 0; j < target.Length; j++) {
1422 if (testedTargetPos < j) {
1423 char c = target [j];
1424 if (c == 0 || c >= 0x80) {
1425 testWasUnable = true;
1429 testedTargetPos = j;
1431 if (testedSourcePos < i + j) {
1433 if (c == 0 || c >= 0x80) {
1434 testWasUnable = true;
1438 testedSourcePos = i + j;
1440 if (s [i + j] != target [j]) {
1452 public unsafe int IndexOf (string s, string target, int start, int length, CompareOptions opt)
1454 if (opt == CompareOptions.Ordinal)
1455 throw new NotSupportedException ("Should not be reached");
1456 if (opt == CompareOptions.OrdinalIgnoreCase)
1457 throw new NotSupportedException ("Should not be reached");
1458 if (opt == CompareOptions.None) {
1460 int ret = QuickIndexOf (s, target, start, length, out testWasUnable);
1465 byte* alwaysMatchFlags = stackalloc byte [16];
1466 byte* neverMatchFlags = stackalloc byte [16];
1467 byte* targetSortKey = stackalloc byte [4];
1468 byte* sk1 = stackalloc byte [4];
1469 byte* sk2 = stackalloc byte [4];
1470 ClearBuffer (alwaysMatchFlags, 16);
1471 ClearBuffer (neverMatchFlags, 16);
1472 ClearBuffer (targetSortKey, 4);
1473 ClearBuffer (sk1, 4);
1474 ClearBuffer (sk2, 4);
1475 Context ctx = new Context (opt, alwaysMatchFlags, neverMatchFlags, sk1, sk2, null);
1477 return IndexOf (s, target, start, length,
1478 targetSortKey, ref ctx);
1481 // note that it wouldn't be used since ordinal IndexOf()
1482 // should be directed to icall.
1483 int IndexOfOrdinal (string s, string target, int start, int length)
1485 if (target.Length == 0)
1487 else if (target.Length > length)
1490 int end = start + length - target.Length + 1;
1491 for (int i = start; i < end; i++) {
1493 for (int j = 0; j < target.Length; j++) {
1494 if (s [i + j] != target [j]) {
1508 public int IndexOf (string s, char target, CompareOptions opt)
1510 return IndexOf (s, target, 0, s.Length, opt);
1513 public unsafe int IndexOf (string s, char target, int start, int length, CompareOptions opt)
1515 if (opt == CompareOptions.Ordinal)
1516 throw new NotSupportedException ("Should not be reached");
1517 if (opt == CompareOptions.OrdinalIgnoreCase)
1518 throw new NotSupportedException ("Should not be reached");
1519 byte* alwaysMatchFlags = stackalloc byte [16];
1520 byte* neverMatchFlags = stackalloc byte [16];
1521 byte* targetSortKey = stackalloc byte [4];
1522 byte* sk1 = stackalloc byte [4];
1523 byte* sk2 = stackalloc byte [4];
1524 ClearBuffer (alwaysMatchFlags, 16);
1525 ClearBuffer (neverMatchFlags, 16);
1526 ClearBuffer (targetSortKey, 4);
1527 ClearBuffer (sk1, 4);
1528 ClearBuffer (sk2, 4);
1529 Context ctx = new Context (opt, alwaysMatchFlags, neverMatchFlags, sk1, sk2, null);
1531 // If target is contraction, then use string search.
1532 Contraction ct = GetContraction (target);
1534 if (ct.Replacement != null)
1535 return IndexOf (s, ct.Replacement,
1536 start, length, targetSortKey, ref ctx);
1538 for (int i = 0; i < ct.SortKey.Length; i++)
1539 sk2 [i] = ct.SortKey [i];
1540 return IndexOfSortKey (s, start, length, sk2, char.MinValue, -1, true, ref ctx);
1543 int ti = FilterOptions ((int) target, opt);
1544 targetSortKey [0] = Category (ti);
1545 targetSortKey [1] = Level1 (ti);
1546 if ((opt & COpt.IgnoreNonSpace) == 0)
1548 Level2 (ti, ExtenderType.None);
1549 targetSortKey [3] = Uni.Level3 (ti);
1550 return IndexOfSortKey (s, start, length,
1551 targetSortKey, target, ti,
1552 !Uni.HasSpecialWeight ((char) ti), ref ctx);
1556 // note that it wouldn't be used since ordinal IndexOf()
1557 // should be directed to icall.
1558 int IndexOfOrdinal (string s, char target, int start, int length)
1560 int end = start + length;
1561 for (int i = start; i < end; i++)
1562 if (s [i] == target)
1567 // Searches target byte[] keydata
1568 unsafe int IndexOfSortKey (string s, int start, int length, byte* sortkey, char target, int ti, bool noLv4, ref Context ctx)
1570 int end = start + length;
1575 if (MatchesForward (s, ref idx, end, ti, sortkey, noLv4, ref ctx))
1581 // Searches string. Search head character (or keydata when
1582 // the head is contraction sortkey) and try IsPrefix().
1583 unsafe int IndexOf (string s, string target, int start, int length, byte* targetSortKey, ref Context ctx)
1585 COpt opt = ctx.Option;
1587 for (; tidx < target.Length; tidx++)
1588 if (!IsIgnorable (target [tidx], opt))
1590 if (tidx == target.Length)
1591 // FIXME: this is likely a hack. A string that is consists of \0 differs from those of other ignorable characters.
1592 return IndexOfOrdinal (target, '\0', 0, target.Length) >= 0 ? IndexOfOrdinal (s, target, start, length) : start;
1593 Contraction ct = GetContraction (target, tidx, target.Length - tidx);
1594 string replace = ct != null ? ct.Replacement : null;
1595 byte* sk = replace == null ? targetSortKey : null;
1597 char tc = char.MinValue;
1599 if (ct != null && sk != null) {
1600 for (int i = 0; i < ct.SortKey.Length; i++)
1601 sk [i] = ct.SortKey [i];
1602 } else if (sk != null) {
1604 ti = FilterOptions (target [tidx], opt);
1605 sk [0] = Category (ti);
1606 sk [1] = Level1 (ti);
1607 if ((opt & COpt.IgnoreNonSpace) == 0)
1608 sk [2] = Level2 (ti, ExtenderType.None);
1609 sk [3] = Uni.Level3 (ti);
1610 noLv4 = !Uni.HasSpecialWeight ((char) ti);
1613 for (tidx++; tidx < target.Length; tidx++) {
1614 if (Category (target [tidx]) != 1)
1618 sk [2] = (byte) (sk [2] + Level2 (target [tidx], ExtenderType.None));
1624 if (replace != null)
1625 idx = IndexOf (s, replace, start, length, targetSortKey, ref ctx);
1627 idx = IndexOfSortKey (s, start, length, sk, tc, ti, noLv4, ref ctx);
1630 length -= idx - start;
1632 if (IsPrefix (s, target, start, length, false, ref ctx))
1634 Contraction cts = GetContraction (s, start, length);
1636 start += cts.Source.Length;
1637 length -= cts.Source.Length;
1642 } while (length > 0);
1647 // There are the same number of LastIndexOf() methods as
1648 // IndexOf() related methods, with the same functionalities
1654 public int LastIndexOf (string s, string target, CompareOptions opt)
1656 return LastIndexOf (s, target, s.Length - 1, s.Length, opt);
1659 public unsafe int LastIndexOf (string s, string target, int start, int length, CompareOptions opt)
1661 if (opt == CompareOptions.Ordinal)
1662 return LastIndexOfOrdinal (s, target, start, length);
1663 if (opt == CompareOptions.OrdinalIgnoreCase)
1664 throw new NotSupportedException ("Should not be reached");
1665 byte* alwaysMatchFlags = stackalloc byte [16];
1666 byte* neverMatchFlags = stackalloc byte [16];
1667 byte* targetSortKey = stackalloc byte [4];
1668 byte* sk1 = stackalloc byte [4];
1669 byte* sk2 = stackalloc byte [4];
1670 ClearBuffer (alwaysMatchFlags, 16);
1671 ClearBuffer (neverMatchFlags, 16);
1672 ClearBuffer (targetSortKey, 4);
1673 ClearBuffer (sk1, 4);
1674 ClearBuffer (sk2, 4);
1675 // For some unknown reason CompareQuick() does not work fine w/ LastIndexOf().
1676 Context ctx = new Context (opt, alwaysMatchFlags, neverMatchFlags, sk1, sk2, null);
1677 return LastIndexOf (s, target, start, length,
1678 targetSortKey, ref ctx);
1681 int LastIndexOfOrdinal (string s, string target, int start, int length)
1683 if (target.Length == 0)
1685 if (s.Length < target.Length || target.Length > length)
1687 int end = start - length + target.Length -1;
1688 char tail = target [target.Length - 1];
1689 for (int i = start; i > end;) {
1690 if (s [i] != tail) {
1694 int x = i - target.Length + 1;
1696 bool mismatch = false;
1697 for (int j = target.Length - 2; j >= 0; j--)
1698 if (s [x + j] != target [j]) {
1711 public int LastIndexOf (string s, char target, CompareOptions opt)
1713 return LastIndexOf (s, target, s.Length - 1, s.Length, opt);
1716 public unsafe int LastIndexOf (string s, char target, int start, int length, CompareOptions opt)
1718 if (opt == CompareOptions.Ordinal)
1719 throw new NotSupportedException ();
1720 if (opt == CompareOptions.OrdinalIgnoreCase)
1721 throw new NotSupportedException ();
1722 byte* alwaysMatchFlags = stackalloc byte [16];
1723 byte* neverMatchFlags = stackalloc byte [16];
1724 byte* targetSortKey = stackalloc byte [4];
1725 byte* sk1 = stackalloc byte [4];
1726 byte* sk2 = stackalloc byte [4];
1727 ClearBuffer (alwaysMatchFlags, 16);
1728 ClearBuffer (neverMatchFlags, 16);
1729 ClearBuffer (targetSortKey, 4);
1730 ClearBuffer (sk1, 4);
1731 ClearBuffer (sk2, 4);
1732 Context ctx = new Context (opt, alwaysMatchFlags, neverMatchFlags, sk1, sk2, null);
1734 // If target is a replacement contraction, then use
1736 Contraction ct = GetContraction (target);
1738 if (ct.Replacement != null)
1739 return LastIndexOf (s,
1740 ct.Replacement, start, length,
1741 targetSortKey, ref ctx);
1743 for (int bi = 0; bi < ct.SortKey.Length; bi++)
1744 sk2 [bi] = ct.SortKey [bi];
1745 return LastIndexOfSortKey (s, start,
1751 int ti = FilterOptions ((int) target, opt);
1752 targetSortKey [0] = Category (ti);
1753 targetSortKey [1] = Level1 (ti);
1754 if ((opt & COpt.IgnoreNonSpace) == 0)
1755 targetSortKey [2] = Level2 (ti, ExtenderType.None);
1756 targetSortKey [3] = Uni.Level3 (ti);
1757 return LastIndexOfSortKey (s, start, start,
1758 length, targetSortKey,
1759 ti, !Uni.HasSpecialWeight ((char) ti),
1764 // Searches target byte[] keydata
1765 unsafe int LastIndexOfSortKey (string s, int start, int orgStart, int length, byte* sortkey, int ti, bool noLv4, ref Context ctx)
1767 int end = start - length;
1771 if (MatchesBackward (s, ref idx, end, orgStart,
1772 ti, sortkey, noLv4, ref ctx))
1778 // Searches string. Search head character (or keydata when
1779 // the head is contraction sortkey) and try IsPrefix().
1780 unsafe int LastIndexOf (string s, string target, int start, int length, byte* targetSortKey, ref Context ctx)
1782 COpt opt = ctx.Option;
1783 int orgStart = start;
1785 for (; tidx < target.Length; tidx++)
1786 if (!IsIgnorable (target [tidx], opt))
1788 if (tidx == target.Length)
1789 // FIXME: this is likely a hack. A string that is consists of \0 differs from those of other ignorable characters.
1790 return IndexOfOrdinal (target, '\0', 0, target.Length) >= 0 ? LastIndexOfOrdinal (s, target, start, length) : start;
1791 Contraction ct = GetContraction (target, tidx, target.Length - tidx);
1792 string replace = ct != null ? ct.Replacement : null;
1793 byte* sk = replace == null ? targetSortKey : null;
1797 if (ct != null && sk != null) {
1798 for (int i = 0; i < ct.SortKey.Length; i++)
1799 sk [i] = ct.SortKey [i];
1800 } else if (sk != null) {
1801 ti = FilterOptions (target [tidx], opt);
1802 sk [0] = Category (ti);
1803 sk [1] = Level1 (ti);
1804 if ((opt & COpt.IgnoreNonSpace) == 0)
1805 sk [2] = Level2 (ti, ExtenderType.None);
1806 sk [3] = Uni.Level3 (ti);
1807 noLv4 = !Uni.HasSpecialWeight ((char) ti);
1810 for (tidx++; tidx < target.Length; tidx++) {
1811 if (Category (target [tidx]) != 1)
1815 sk [2] = (byte) (sk [2] + Level2 (target [tidx], ExtenderType.None));
1822 if (replace != null)
1823 idx = LastIndexOf (s, replace,
1825 targetSortKey, ref ctx);
1827 idx = LastIndexOfSortKey (s, start, orgStart, length, sk, ti, noLv4, ref ctx);
1830 length -= start - idx;
1833 if (IsPrefix (s, target, idx, orgStart - idx + 1, false, ref ctx)) {
1834 for (;idx < orgStart; idx++)
1835 if (!IsIgnorable (s [idx], opt))
1839 Contraction cts = GetContraction (s, idx, orgStart - idx + 1);
1841 start -= cts.Source.Length;
1842 length -= cts.Source.Length;
1847 } while (length > 0);
1851 unsafe bool MatchesForward (string s, ref int idx, int end, int ti, byte* sortkey, bool noLv4, ref Context ctx)
1854 if (ctx.AlwaysMatchFlags != null && si < 128 && (ctx.AlwaysMatchFlags [si / 8] & (1 << (si % 8))) != 0)
1856 if (ctx.NeverMatchFlags != null &&
1858 (ctx.NeverMatchFlags [si / 8] & (1 << (si % 8))) != 0) {
1862 ExtenderType ext = GetExtenderType (s [idx]);
1863 Contraction ct = null;
1864 if (MatchesForwardCore (s, ref idx, end, ti, sortkey, noLv4, ext, ref ct, ref ctx)) {
1865 if (ctx.AlwaysMatchFlags != null && ct == null && ext == ExtenderType.None && si < 128)
1866 ctx.AlwaysMatchFlags [si / 8] |= (byte) (1 << (si % 8));
1869 if (ctx.NeverMatchFlags != null && ct == null && ext == ExtenderType.None && si < 128)
1870 ctx.NeverMatchFlags [si / 8] |= (byte) (1 << (si % 8));
1874 unsafe bool MatchesForwardCore (string s, ref int idx, int end, int ti, byte* sortkey, bool noLv4, ExtenderType ext, ref Contraction ct, ref Context ctx)
1876 COpt opt = ctx.Option;
1877 byte* charSortKey = ctx.Buffer1;
1878 bool ignoreNonSpace = (opt & COpt.IgnoreNonSpace) != 0;
1880 if (ext == ExtenderType.None)
1881 ct = GetContraction (s, idx, end);
1882 else if (ctx.PrevCode < 0) {
1883 if (ctx.PrevSortKey == null) {
1887 charSortKey = ctx.PrevSortKey;
1890 si = FilterExtender (ctx.PrevCode, ext, opt);
1891 // if lv4 exists, it never matches contraction
1893 idx += ct.Source.Length;
1896 if (ct.SortKey != null) {
1897 for (int i = 0; i < 4; i++)
1898 charSortKey [i] = sortkey [i];
1900 ctx.PrevSortKey = charSortKey;
1902 // Here is the core of LAMESPEC
1903 // described at the top of the source.
1905 return MatchesForward (ct.Replacement, ref dummy,
1906 ct.Replacement.Length, ti, sortkey, noLv4, ref ctx);
1910 si = FilterOptions (s [idx], opt);
1912 charSortKey [0] = Category (si);
1913 bool noMatch = false;
1914 if (sortkey [0] == charSortKey [0])
1915 charSortKey [1] = Level1 (si);
1918 if (!ignoreNonSpace && sortkey [1] == charSortKey [1])
1919 charSortKey [2] = Level2 (si, ext);
1920 else if (!ignoreNonSpace)
1923 for (; idx < end; idx++) {
1924 if (Category (s [idx]) != 1)
1929 charSortKey [3] = Uni.Level3 (si);
1930 if (charSortKey [0] != 1)
1933 for (; idx < end; idx++) {
1934 if (Category (s [idx]) != 1)
1938 if (charSortKey [2] == 0)
1939 charSortKey [2] = 2;
1940 charSortKey [2] = (byte) (charSortKey [2]
1941 + Level2 (s [idx], ExtenderType.None));
1944 return MatchesPrimitive (opt, charSortKey, si, ext, sortkey, ti, noLv4);
1947 unsafe bool MatchesPrimitive (COpt opt, byte* source, int si, ExtenderType ext, byte* target, int ti, bool noLv4)
1949 bool ignoreNonSpace = (opt & COpt.IgnoreNonSpace) != 0;
1950 if (source [0] != target [0] ||
1951 source [1] != target [1] ||
1952 (!ignoreNonSpace && source [2] != target [2]) ||
1953 source [3] != target [3])
1955 if (noLv4 && (si < 0 || !Uni.HasSpecialWeight ((char) si)))
1959 // Since target can never be an extender, if the source
1960 // is an expander and it matters, then they never match.
1961 if (!ignoreNonSpace && ext == ExtenderType.Conditional)
1963 if (Uni.IsJapaneseSmallLetter ((char) si) !=
1964 Uni.IsJapaneseSmallLetter ((char) ti) ||
1965 ToDashTypeValue (ext, opt) !=
1966 // FIXME: we will have to specify correct value for target
1967 ToDashTypeValue (ExtenderType.None, opt) ||
1968 !Uni.IsHiragana ((char) si) !=
1969 !Uni.IsHiragana ((char) ti) ||
1970 IsHalfKana ((char) si, opt) !=
1971 IsHalfKana ((char) ti, opt))
1976 unsafe bool MatchesBackward (string s, ref int idx, int end, int orgStart, int ti, byte* sortkey, bool noLv4, ref Context ctx)
1979 if (ctx.AlwaysMatchFlags != null && si < 128 && (ctx.AlwaysMatchFlags [si / 8] & (1 << (si % 8))) != 0)
1981 if (ctx.NeverMatchFlags != null && si < 128 && (ctx.NeverMatchFlags [si / 8] & (1 << (si % 8))) != 0) {
1985 ExtenderType ext = GetExtenderType (s [idx]);
1986 Contraction ct = null;
1987 if (MatchesBackwardCore (s, ref idx, end, orgStart, ti, sortkey, noLv4, ext, ref ct, ref ctx)) {
1988 if (ctx.AlwaysMatchFlags != null && ct == null && ext == ExtenderType.None && si < 128)
1989 ctx.AlwaysMatchFlags [si / 8] |= (byte) (1 << (si % 8));
1992 if (ctx.NeverMatchFlags != null && ct == null && ext == ExtenderType.None && si < 128) {
1993 ctx.NeverMatchFlags [si / 8] |= (byte) (1 << (si % 8));
1998 unsafe bool MatchesBackwardCore (string s, ref int idx, int end, int orgStart, int ti, byte* sortkey, bool noLv4, ExtenderType ext, ref Contraction ct, ref Context ctx)
2000 COpt opt = ctx.Option;
2001 byte* charSortKey = ctx.Buffer1;
2002 bool ignoreNonSpace = (opt & COpt.IgnoreNonSpace) != 0;
2005 // To handle extenders in source, we need to
2006 // check next _primary_ character.
2007 if (ext != ExtenderType.None) {
2008 byte diacritical = 0;
2009 for (int tmp = idx; ; tmp--) {
2010 if (tmp < 0) // heading extender
2012 if (IsIgnorable (s [tmp], opt))
2014 int tmpi = FilterOptions (s [tmp], opt);
2015 byte category = Category (tmpi);
2016 if (category == 1) {
2017 diacritical = Level2 (tmpi, ExtenderType.None);
2020 si = FilterExtender (tmpi, ext, opt);
2021 charSortKey [0] = category;
2022 charSortKey [1] = Level1 (si);
2023 if (!ignoreNonSpace)
2024 charSortKey [2] = Level2 (si, ext);
2025 charSortKey [3] = Uni.Level3 (si);
2026 if (ext != ExtenderType.Conditional &&
2029 (charSortKey [2] == 0) ?
2030 (byte) (diacritical + 2) :
2036 if (ext == ExtenderType.None)
2037 ct = GetTailContraction (s, idx, end);
2038 // if lv4 exists, it never matches contraction
2040 idx -= ct.Source.Length;
2043 if (ct.SortKey != null) {
2044 for (int i = 0; i < 4; i++)
2045 charSortKey [i] = sortkey [i];
2047 ctx.PrevSortKey = charSortKey;
2049 // Here is the core of LAMESPEC
2050 // described at the top of the source.
2051 int dummy = ct.Replacement.Length - 1;
2052 return 0 <= LastIndexOfSortKey (
2053 ct.Replacement, dummy, dummy,
2054 ct.Replacement.Length, sortkey,
2055 ti, noLv4, ref ctx);
2057 } else if (ext == ExtenderType.None) {
2059 si = FilterOptions (s [idx], opt);
2061 bool noMatch = false;
2062 charSortKey [0] = Category (si);
2063 if (charSortKey [0] == sortkey [0])
2064 charSortKey [1] = Level1 (si);
2067 if (!ignoreNonSpace && charSortKey [1] == sortkey [1])
2068 charSortKey [2] = Level2 (si, ext);
2069 else if (!ignoreNonSpace)
2073 charSortKey [3] = Uni.Level3 (si);
2074 if (charSortKey [0] != 1)
2077 if (ext == ExtenderType.None) {
2078 for (int tmp = cur + 1; tmp < orgStart; tmp++) {
2079 if (Category (s [tmp]) != 1)
2083 if (charSortKey [2] == 0)
2084 charSortKey [2] = 2;
2085 charSortKey [2] = (byte) (charSortKey [2]
2086 + Level2 (s [tmp], ExtenderType.None));
2089 return MatchesPrimitive (opt, charSortKey, si, ext, sortkey, ti, noLv4);