2 // SimpleCollator.cs : the core collator implementation
5 // Atsushi Enomoto <atsushi@ximian.com>
7 // Copyright (C) 2005 Novell, Inc (http://www.novell.com)
9 // Permission is hereby granted, free of charge, to any person obtaining
10 // a copy of this software and associated documentation files (the
11 // "Software"), to deal in the Software without restriction, including
12 // without limitation the rights to use, copy, modify, merge, publish,
13 // distribute, sublicense, and/or sell copies of the Software, and to
14 // permit persons to whom the Software is furnished to do so, subject to
15 // the following conditions:
17 // The above copyright notice and this permission notice shall be
18 // included in all copies or substantial portions of the Software.
20 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
23 // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
24 // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
25 // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
26 // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
30 // Here's a summary for supporting contractions, expansions and diacritical
33 // Diacritical mapping is a simple tailoring rule that "remaps" diacritical
34 // weight value from one to another. For now it applies to all range of
35 // characters, but at some stage we might need to limit the remapping ranges.
37 // A Contraction consists of a string (actually char[]) and a sortkey entry
38 // (i.e. byte[]). It indicates that a sequence of characters is interpreted
39 // as a single character that has the mapped sortkey value. There is no
40 // character which goes across "special rules". When the collator encountered
41 // such a sequence of characters, it just returns the sortkey value without
42 // further replacement.
44 // Since it is still likely to happen that a contraction sequence matches
45 // other character than the identical sequence (depending on CompareOptions
46 // and of course, defined sortkey value itself), comparison cannot be done
47 // at source char[] level.
49 // In IndexOf(), the first character in the target string or the target char
50 // itself is turned into sortkey bytes. If the character has a contraction and
51 // that is sortkey map, then it is used instead. If the contraction exists and
52 // that is replacement map, then the first character of the replacement string
53 // is searched instead. IndexOf() always searches only for the top character,
54 // and if it returned negative value, then it returns -1. Otherwise, it then
55 // tries IsPrefix() from that location. If it returns true, then it returns
58 // LAMESPEC: IndexOf() is lame as a whole API. It never matches in the middle
59 // of expansion and there is no proper way to return such indexes within
60 // a single int return value.
62 // For example, try below in .NET:
63 // IndexOf("\u00E6", "a")
64 // IndexOf("\u00E6", "e")
69 using System.Collections;
70 using System.Globalization;
72 using Uni = Mono.Globalization.Unicode.MSCompatUnicodeTable;
73 using UUtil = Mono.Globalization.Unicode.MSCompatUnicodeTableUtil;
75 namespace Mono.Globalization.Unicode
77 internal class SimpleCollator
79 static SimpleCollator invariant =
80 new SimpleCollator (CultureInfo.InvariantCulture);
83 // CompareOptions expanded.
84 bool ignoreNonSpace; // used in IndexOf()
89 TextInfo textInfo; // for ToLower().
91 unsafe readonly byte* cjkCatTable;
92 unsafe readonly byte* cjkLv1Table;
93 readonly CodePointIndexer cjkIndexer;
94 unsafe readonly byte* cjkLv2Table;
95 readonly CodePointIndexer cjkLv2Indexer;
97 readonly Contraction [] contractions;
98 readonly Level2Map [] level2Maps;
100 // This flag marks characters as "unsafe", where the character
101 // could be used as part of a contraction (whose length > 1).
102 readonly byte [] unsafeFlags;
104 const int UnsafeFlagLength = 0x300 / 8;
106 // temporary sortkey buffer for index search/comparison
107 byte [] charSortKey = new byte [4];
108 byte [] charSortKey2 = new byte [4];
109 byte [] charSortKeyIndexTarget = new byte [4];
111 #region .ctor() and split functions
113 public SimpleCollator (CultureInfo culture)
116 textInfo = culture.TextInfo;
117 buf = new SortKeyBuffer (culture.LCID);
120 SetCJKTable (culture, ref cjkIndexer,
121 ref cjkCatTable, ref cjkLv1Table,
122 ref cjkLv2Indexer, ref cjkLv2Table);
125 // Get tailoring info
126 TailoringInfo t = null;
127 for (CultureInfo ci = culture; ci.LCID != 127; ci = ci.Parent) {
128 t = Uni.GetTailoringInfo (ci.LCID);
132 if (t == null) // then use invariant
133 t = Uni.GetTailoringInfo (127);
135 frenchSort = t.FrenchSort;
136 Uni.BuildTailoringTables (culture, t, ref contractions,
138 unsafeFlags = new byte [UnsafeFlagLength];
139 foreach (Contraction c in contractions)
140 if (c.Source.Length > 1)
141 foreach (char ch in c.Source)
142 unsafeFlags [(int) ch / 8 ]
143 |= (byte) ((int) ch % 8);
145 foreach (Contraction c in invariant.contractions)
146 if (c.Source.Length > 1)
147 foreach (char ch in c.Source)
148 unsafeFlags [(int) ch / 8 ]
149 |= (byte) ((int) ch % 8);
151 // FIXME: Since tailorings are mostly for latin
152 // (and in some cases Cyrillic) characters, it would
153 // be much better for performance to store "start
154 // indexes" for > 370 (culture-specific letters).
157 // dump tailoring table
158 Console.WriteLine ("******** building table for {0} : c - {1} d - {2}",
159 culture.LCID, contractions.Length, level2Maps.Length);
160 foreach (Contraction c in contractions) {
161 foreach (char cc in c.Source)
162 Console.Write ("{0:X4} ", (int) cc);
163 Console.WriteLine (" -> '{0}'", c.Replacement);
168 unsafe private void SetCJKTable (
169 CultureInfo culture, ref CodePointIndexer cjkIndexer,
170 ref byte* catTable, ref byte* lv1Table,
171 ref CodePointIndexer lv2Indexer, ref byte* lv2Table)
173 string name = GetNeutralCulture (culture).Name;
175 Uni.FillCJK (name, ref cjkIndexer, ref catTable,
176 ref lv1Table, ref lv2Indexer, ref lv2Table);
179 static CultureInfo GetNeutralCulture (CultureInfo info)
181 CultureInfo ret = info;
182 while (ret.Parent != null && ret.Parent.LCID != 127)
189 unsafe byte Category (int cp)
191 if (cp < 0x3000 || cjkCatTable == null)
192 return Uni.Category (cp);
193 int idx = cjkIndexer.ToIndex (cp);
194 return idx < 0 ? Uni.Category (cp) : cjkCatTable [idx];
197 unsafe byte Level1 (int cp)
199 if (cp < 0x3000 || cjkLv1Table == null)
200 return Uni.Level1 (cp);
201 int idx = cjkIndexer.ToIndex (cp);
202 return idx < 0 ? Uni.Level1 (cp) : cjkLv1Table [idx];
205 unsafe byte Level2 (int cp, ExtenderType ext)
207 if (ext == ExtenderType.Buggy)
209 else if (ext == ExtenderType.Conditional)
212 if (cp < 0x3000 || cjkLv2Table == null)
213 return Uni.Level2 (cp);
214 int idx = cjkLv2Indexer.ToIndex (cp);
215 byte ret = idx < 0 ? (byte) 0 : cjkLv2Table [idx];
218 ret = Uni.Level2 (cp);
219 if (level2Maps.Length == 0)
221 for (int i = 0; i < level2Maps.Length; i++) {
222 if (level2Maps [i].Source == ret)
223 return level2Maps [i].Replace;
224 else if (level2Maps [i].Source > ret)
230 bool IsHalfKana (int cp)
232 return ignoreWidth || Uni.IsHalfWidthKana ((char) cp);
235 void SetOptions (CompareOptions options)
237 this.ignoreNonSpace = (options & CompareOptions.IgnoreNonSpace) != 0;
238 this.ignoreSymbols = (options & CompareOptions.IgnoreSymbols) != 0;
239 this.ignoreWidth = (options & CompareOptions.IgnoreWidth) != 0;
240 this.ignoreCase = (options & CompareOptions.IgnoreCase) != 0;
241 this.ignoreKanaType = (options & CompareOptions.IgnoreKanaType) != 0;
242 previousChar = previousChar2 = -1;
243 previousSortKey = previousSortKey2 = null;
244 escape1.Source = escape2.Source = null;
247 Contraction GetContraction (string s, int start, int end)
249 Contraction c = GetContraction (s, start, end, contractions);
250 if (c != null || lcid == 127)
252 return GetContraction (s, start, end, invariant.contractions);
255 Contraction GetContraction (string s, int start, int end, Contraction [] clist)
257 for (int i = 0; i < clist.Length; i++) {
258 Contraction ct = clist [i];
259 int diff = ct.Source [0] - s [start];
261 return null; // it's already sorted
264 char [] chars = ct.Source;
265 if (end - start < chars.Length)
268 for (int n = 0; n < chars.Length; n++)
269 if (s [start + n] != chars [n]) {
279 Contraction GetTailContraction (string s, int start, int end)
281 Contraction c = GetTailContraction (s, start, end, contractions);
282 if (c != null || lcid == 127)
284 return GetTailContraction (s, start, end, invariant.contractions);
287 Contraction GetTailContraction (string s, int start, int end, Contraction [] clist)
289 for (int i = 0; i < clist.Length; i++) {
290 Contraction ct = clist [i];
291 int diff = ct.Source [0] - s [end];
293 return null; // it's already sorted
296 char [] chars = ct.Source;
297 if (start - end + 1 < chars.Length)
300 int offset = start - chars.Length + 1;
301 for (int n = 0; n < chars.Length; n++)
302 if (s [offset + n] != chars [n]) {
312 Contraction GetContraction (char c)
314 Contraction ct = GetContraction (c, contractions);
315 if (ct != null || lcid == 127)
317 return GetContraction (c, invariant.contractions);
320 Contraction GetContraction (char c, Contraction [] clist)
322 for (int i = 0; i < clist.Length; i++) {
323 Contraction ct = clist [i];
324 if (ct.Source [0] > c)
325 return null; // it's already sorted
326 if (ct.Source [0] == c && ct.Source.Length == 1)
332 int FilterOptions (int i)
335 int x = Uni.ToWidthCompat (i);
340 i = textInfo.ToLower ((char) i);
342 i = Uni.ToKanaTypeInsensitive (i);
346 int previousChar = -1;
347 byte [] previousSortKey = null;
348 int previousChar2 = -1;
349 byte [] previousSortKey2 = null;
359 ExtenderType GetExtenderType (int i)
361 // LAMESPEC: Windows expects true for U+3005, but
362 // sometimes it does not represent to repeat just
364 // Windows also expects true for U+3031 and U+3032,
365 // but they should *never* repeat one character.
367 // U+2015 becomes an extender only when it is Japanese
369 return lcid == 16 ? ExtenderType.Conditional :
372 if (i < 0x3005 || i > 0xFF70)
373 return ExtenderType.None;
378 return ExtenderType.Simple;
380 return ExtenderType.Conditional;
383 return ExtenderType.Voiced;
387 return ExtenderType.None;
389 case 0x3005: // LAMESPEC: see above.
390 return ExtenderType.Buggy;
391 case 0x3031: // LAMESPEC: see above.
392 case 0x3032: // LAMESPEC: see above.
395 return ExtenderType.Simple;
398 return ExtenderType.Voiced;
400 return ExtenderType.Conditional;
402 return ExtenderType.None;
406 byte ToDashTypeValue (ExtenderType ext)
408 if (ignoreNonSpace) // LAMESPEC: huh, why?
411 case ExtenderType.None:
413 case ExtenderType.Conditional:
420 int FilterExtender (int i, ExtenderType ext)
422 if (ext == ExtenderType.Conditional &&
423 Uni.HasSpecialWeight ((char) i)) {
424 bool half = IsHalfKana ((char) i);
425 bool katakana = !Uni.IsHiragana ((char) i);
426 switch (Level1 (i) & 7) {
428 return half ? 0xFF71 : katakana ? 0x30A2 : 0x3042;
430 return half ? 0xFF72 : katakana ? 0x30A4 : 0x3044;
432 return half ? 0xFF73 : katakana ? 0x30A6 : 0x3046;
434 return half ? 0xFF74 : katakana ? 0x30A8 : 0x3048;
436 return half ? 0xFF75 : katakana ? 0x30AA : 0x304A;
442 bool IsIgnorable (int i)
444 return Uni.IsIgnorable (i) ||
445 ignoreSymbols && Uni.IsIgnorableSymbol (i) ||
446 ignoreNonSpace && Uni.IsIgnorableNonSpacing (i);
451 return i / 8 >= unsafeFlags.Length ? true : (unsafeFlags [i / 8] & (1 << (i % 8))) == 0;
456 public SortKey GetSortKey (string s)
458 return GetSortKey (s, CompareOptions.None);
461 public SortKey GetSortKey (string s, CompareOptions options)
463 return GetSortKey (s, 0, s.Length, options);
466 public SortKey GetSortKey (string s, int start, int length, CompareOptions options)
468 SetOptions (options);
470 buf.Initialize (options, lcid, s, frenchSort);
471 int end = start + length;
472 GetSortKey (s, start, end);
473 return buf.GetResultAndReset ();
476 void GetSortKey (string s, int start, int end)
478 for (int n = start; n < end; n++) {
481 ExtenderType ext = GetExtenderType (i);
482 if (ext != ExtenderType.None) {
483 i = FilterExtender (previousChar, ext);
485 FillSortKeyRaw (i, ext);
486 else if (previousSortKey != null) {
487 byte [] b = previousSortKey;
491 b [2] != 1 ? b [2] : Level2 (i, ext),
492 b [3] != 1 ? b [3] : Uni.Level3 (i));
494 // otherwise do nothing.
495 // (if the extender is the first char
496 // in the string, then just ignore.)
502 i = FilterOptions (i);
504 Contraction ct = GetContraction (s, n, end);
506 if (ct.Replacement != null) {
507 GetSortKey (ct.Replacement, 0, ct.Replacement.Length);
509 byte [] b = ct.SortKey;
513 b [2] != 1 ? b [2] : Level2 (i, ext),
514 b [3] != 1 ? b [3] : Uni.Level3 (i));
518 n += ct.Source.Length - 1;
521 if (!Uni.IsIgnorableNonSpacing (i))
523 FillSortKeyRaw (i, ExtenderType.None);
528 void FillSortKeyRaw (int i, ExtenderType ext)
530 if (0x3400 <= i && i <= 0x4DB5) {
531 int diff = i - 0x3400;
532 buf.AppendCJKExtension (
533 (byte) (0x10 + diff / 254),
534 (byte) (diff % 254 + 2));
538 UnicodeCategory uc = char.GetUnicodeCategory ((char) i);
540 case UnicodeCategory.PrivateUse:
541 int diff = i - 0xE000;
543 (byte) (0xE5 + diff / 254),
544 (byte) (diff % 254 + 2),
548 case UnicodeCategory.Surrogate:
549 FillSurrogateSortKeyRaw (i);
553 byte level2 = Level2 (i, ext);
554 if (Uni.HasSpecialWeight ((char) i)) {
555 byte level1 = Level1 (i);
561 Uni.IsJapaneseSmallLetter ((char) i),
562 ToDashTypeValue (ext),
563 !Uni.IsHiragana ((char) i),
564 IsHalfKana ((char) i)
566 if (!ignoreNonSpace && ext == ExtenderType.Voiced)
567 // Append voice weight
568 buf.AppendNormal (1, 1, 1, 0);
578 void FillSurrogateSortKeyRaw (int i)
587 lower = (byte) ((i == 0xD800) ? 0x3E : 0x3F);
588 } else if (0xD840 <= i && i < 0xD880) {
592 } else if (0xDB80 <= i && i < 0xDC00) {
593 diffbase = 0xDB80 - 0x40;
597 diffbase = 0xDC00 - 0xF8 + 2;
601 int diff = i - diffbase;
604 (byte) (segment + diff / 254),
605 (byte) (diff % 254 + 2),
614 public int Compare (string s1, string s2)
616 return Compare (s1, s2, CompareOptions.None);
619 public int Compare (string s1, string s2, CompareOptions options)
621 return Compare (s1, 0, s1.Length, s2, 0, s2.Length, options);
626 public string Source;
633 // Those instances are reused not to invoke instantiation
635 Escape escape1 = new Escape ();
636 Escape escape2 = new Escape ();
638 private int CompareOrdinal (string s1, int idx1, int len1,
639 string s2, int idx2, int len2)
641 int min = len1 < len2 ? len1 : len2;
642 int end1 = idx1 + min;
643 int end2 = idx2 + min;
644 for (int i1 = idx1, i2 = idx2;
645 i1 < end1 && i2 < end2; i1++, i2++)
646 if (s1 [i1] != s2 [i2])
647 return s1 [i1] - s2 [i2];
648 return len1 == len2 ? 0 :
649 len1 == min ? - 1 : 1;
652 public int Compare (string s1, int idx1, int len1,
653 string s2, int idx2, int len2, CompareOptions options)
655 // quick equality check
656 if (idx1 == idx2 && len1 == len2 &&
657 Object.ReferenceEquals (s1, s2))
659 // FIXME: this should be done inside Compare() at
661 // int ord = CompareOrdinal (s1, idx1, len1, s2, idx2, len2);
664 if (options == CompareOptions.Ordinal)
665 return CompareOrdinal (s1, idx1, len1, s2, idx2, len2);
667 #if false // stable easy version, depends on GetSortKey().
668 SortKey sk1 = GetSortKey (s1, idx1, len1, options);
669 SortKey sk2 = GetSortKey (s2, idx2, len2, options);
670 byte [] d1 = sk1.KeyData;
671 byte [] d2 = sk2.KeyData;
672 int len = d1.Length > d2.Length ? d2.Length : d1.Length;
673 for (int i = 0; i < len; i++)
674 if (d1 [i] != d2 [i])
675 return d1 [i] < d2 [i] ? -1 : 1;
676 return d1.Length == d2.Length ? 0 : d1.Length < d2.Length ? -1 : 1;
678 SetOptions (options);
680 int ret = CompareInternal (s1, idx1, len1, s2, idx2, len2, (options & CompareOptions.StringSort) != 0, out dummy, out dummy2, true);
681 return ret == 0 ? 0 : ret < 0 ? -1 : 1;
685 int CompareInternal (string s1, int idx1, int len1, string s2,
686 int idx2, int len2, bool stringSort,
687 out bool targetConsumed, out bool sourceConsumed,
688 bool skipHeadingExtenders)
692 int end1 = idx1 + len1;
693 int end2 = idx2 + len2;
694 targetConsumed = false;
695 sourceConsumed = false;
697 // It holds final result that comes from the comparison
698 // at level 2 or lower. Even if Compare() found the
699 // difference at level 2 or lower, it still has to
700 // continue level 1 comparison. FinalResult is used
701 // when there was no further differences.
703 // It holds the comparison level to do. It starts from
704 // 5, and becomes 1 when we do primary-only comparison.
705 int currentLevel = 5;
712 // Skip heading extenders
713 if (skipHeadingExtenders) {
714 for (; idx1 < end1; idx1++)
715 if (GetExtenderType (s1 [idx1]) == ExtenderType.None)
717 for (; idx2 < end2; idx2++)
718 if (GetExtenderType (s2 [idx2]) == ExtenderType.None)
722 ExtenderType ext1 = ExtenderType.None;
723 ExtenderType ext2 = ExtenderType.None;
725 int quickCheckPos1 = idx1;
726 int quickCheckPos2 = idx2;
729 for (; idx1 < end1; idx1++)
730 if (!IsIgnorable (s1 [idx1]))
732 for (; idx2 < end2; idx2++)
733 if (!IsIgnorable (s2 [idx2]))
737 if (escape1.Source == null)
740 start1 = escape1.Start;
741 idx1 = escape1.Index;
743 quickCheckPos1 = escape1.Optional;
744 escape1.Source = null;
748 if (escape2.Source == null)
751 start2 = escape2.Start;
752 idx2 = escape2.Index;
754 quickCheckPos2 = escape2.Optional;
755 escape2.Source = null;
759 // If comparison is unstable, then this part is one of the most doubtful part.
760 // Here we run quick codepoint comparison and run back to "stable" area to
761 // compare characters.
763 // Strictly to say, even the first character
764 // could be compared here, but it messed
765 // backward step, so I just avoided mess.
766 if (quickCheckPos1 < idx1 && quickCheckPos2 < idx2) {
767 while (idx1 < end1 && idx2 < end2 &&
768 s1 [idx1] == s2 [idx2]) {
772 if (idx1 == end1 || idx2 == end2)
773 continue; // check replacement
775 int backwardEnd1 = quickCheckPos1;
776 int backwardEnd2 = quickCheckPos2;
777 quickCheckPos1 = idx1;
778 quickCheckPos2 = idx2;
783 for (;idx1 > backwardEnd1; idx1--)
784 if (Category (s1 [idx1]) != 1)
786 for (;idx2 > backwardEnd2; idx2--)
787 if (Category (s2 [idx2]) != 1)
789 for (;idx1 > backwardEnd1; idx1--)
790 if (IsSafe (s1 [idx1]))
792 for (;idx2 > backwardEnd2; idx2--)
793 if (IsSafe (s2 [idx2]))
802 int i1 = FilterOptions (s1 [idx1]);
803 int i2 = FilterOptions (s2 [idx2]);
804 bool special1 = false;
805 bool special2 = false;
807 // If current character is an expander, then
808 // repeat the previous character.
809 ext1 = GetExtenderType (i1);
810 if (ext1 != ExtenderType.None) {
811 if (previousChar < 0) {
812 if (previousSortKey == null) {
817 sk1 = previousSortKey;
820 i1 = FilterExtender (previousChar, ext1);
822 ext2 = GetExtenderType (i2);
823 if (ext2 != ExtenderType.None) {
824 if (previousChar2 < 0) {
825 if (previousSortKey2 == null) {
830 sk2 = previousSortKey2;
833 i2 = FilterExtender (previousChar2, ext2);
836 byte cat1 = Category (i1);
837 byte cat2 = Category (i2);
839 // Handle special weight characters
840 if (!stringSort && currentLevel > 4) {
842 lv5At1 = escape1.Source != null ?
843 escape1.Index - escape1.Start :
845 // here Windows has a bug that it does
846 // not consider thirtiary weight.
847 lv5Value1 = Level1 (i1) << 8 + Uni.Level3 (i1);
852 lv5At2 = escape2.Source != null ?
853 escape2.Index - escape2.Start :
855 // here Windows has a bug that it does
856 // not consider thirtiary weight.
857 lv5Value2 = Level1 (i2) << 8 + Uni.Level3 (i2);
861 if (cat1 == 6 || cat2 == 6) {
867 Contraction ct1 = null;
868 if (ext1 == ExtenderType.None)
869 ct1 = GetContraction (s1, idx1, end1);
874 else if (ct1 != null) {
875 offset1 = ct1.Source.Length;
876 if (ct1.SortKey != null) {
878 for (int i = 0; i < ct1.SortKey.Length; i++)
879 sk1 [i] = ct1.SortKey [i];
881 previousSortKey = sk1;
883 else if (escape1.Source == null) {
885 escape1.Start = start1;
886 escape1.Index = cur1 + ct1.Source.Length;
888 escape1.Optional = quickCheckPos1;
889 s1 = ct1.Replacement;
900 sk1 [1] = Level1 (i1);
901 if (!ignoreNonSpace && currentLevel > 1)
902 sk1 [2] = Level2 (i1, ext1);
903 if (currentLevel > 2)
904 sk1 [3] = Uni.Level3 (i1);
905 if (currentLevel > 3)
906 special1 = Uni.HasSpecialWeight ((char) i1);
911 Contraction ct2 = null;
912 if (ext2 == ExtenderType.None)
913 ct2 = GetContraction (s2, idx2, end2);
917 else if (ct2 != null) {
918 idx2 += ct2.Source.Length;
919 if (ct2.SortKey != null) {
921 for (int i = 0; i < ct2.SortKey.Length; i++)
922 sk2 [i] = ct2.SortKey [i];
924 previousSortKey2 = sk2;
926 else if (escape2.Source == null) {
928 escape2.Start = start2;
929 escape2.Index = cur2 + ct2.Source.Length;
931 escape2.Optional = quickCheckPos2;
932 s2 = ct2.Replacement;
943 sk2 [1] = Level1 (i2);
944 if (!ignoreNonSpace && currentLevel > 1)
945 sk2 [2] = Level2 (i2, ext2);
946 if (currentLevel > 2)
947 sk2 [3] = Uni.Level3 (i2);
948 if (currentLevel > 3)
949 special2 = Uni.HasSpecialWeight ((char) i2);
955 // Add offset here so that it does not skip
956 // idx1 while s2 has a replacement.
959 // add diacritical marks in s1 here
960 if (!ignoreNonSpace) {
961 while (idx1 < end1) {
962 if (Category (s1 [idx1]) != 1)
966 sk1 [2] = (byte) (sk1 [2] +
967 Level2 (s1 [idx1], ExtenderType.None));
971 // add diacritical marks in s2 here
972 while (idx2 < end2) {
973 if (Category (s2 [idx2]) != 1)
977 sk2 [2] = (byte) (sk2 [2] +
978 Level2 (s2 [idx2], ExtenderType.None));
983 int ret = sk1 [0] - sk2 [0];
984 ret = ret != 0 ? ret : sk1 [1] - sk2 [1];
987 if (currentLevel == 1)
989 if (!ignoreNonSpace) {
990 ret = sk1 [2] - sk2 [2];
993 currentLevel = frenchSort ? 2 : 1;
997 if (currentLevel == 2)
999 ret = sk1 [3] - sk2 [3];
1005 if (currentLevel == 3)
1007 if (special1 != special2) {
1008 finalResult = special1 ? 1 : -1;
1013 ret = CompareFlagPair (
1014 !Uni.IsJapaneseSmallLetter ((char) i1),
1015 !Uni.IsJapaneseSmallLetter ((char) i2));
1016 ret = ret != 0 ? ret :
1017 ToDashTypeValue (ext1) -
1018 ToDashTypeValue (ext2);
1019 ret = ret != 0 ? ret : CompareFlagPair (
1020 Uni.IsHiragana ((char) i1),
1021 Uni.IsHiragana ((char) i2));
1022 ret = ret != 0 ? ret : CompareFlagPair (
1023 !IsHalfKana ((char) i1),
1024 !IsHalfKana ((char) i2));
1033 // If there were only level 3 or lower differences,
1034 // then we still have to find diacritical differences
1036 if (!ignoreNonSpace &&
1037 finalResult != 0 && currentLevel > 2) {
1038 while (idx1 < end1 && idx2 < end2) {
1039 if (!Uni.IsIgnorableNonSpacing (s1 [idx1]))
1041 if (!Uni.IsIgnorableNonSpacing (s2 [idx2]))
1043 finalResult = Level2 (FilterOptions ((s1 [idx1])), ext1) - Level2 (FilterOptions (s2 [idx2]), ext2);
1044 if (finalResult != 0)
1048 // they should work only for the first character
1049 ext1 = ExtenderType.None;
1050 ext2 = ExtenderType.None;
1053 if (currentLevel == 1 && finalResult != 0) {
1055 if (Uni.IsIgnorableNonSpacing (s1 [idx1]))
1058 if (Uni.IsIgnorableNonSpacing (s2 [idx2]))
1061 // we still have to handle level 5
1062 if (finalResult == 0) {
1063 finalResult = lv5At1 - lv5At2;
1064 if (finalResult == 0)
1065 finalResult = lv5Value1 - lv5Value2;
1067 if (finalResult == 0) {
1069 targetConsumed = true;
1071 sourceConsumed = true;
1073 return idx1 != end1 ? 1 : idx2 == end2 ? finalResult : -1;
1076 int CompareFlagPair (bool b1, bool b2)
1078 return b1 == b2 ? 0 : b1 ? 1 : -1;
1083 #region IsPrefix() and IsSuffix()
1085 public bool IsPrefix (string src, string target, CompareOptions opt)
1087 return IsPrefix (src, target, 0, src.Length, opt);
1090 public bool IsPrefix (string s, string target, int start, int length, CompareOptions opt)
1093 return IsPrefix (s, target, start, length,
1094 (opt & CompareOptions.StringSort) != 0, true);
1097 public bool IsPrefix (string s, string target, int start, int length, bool stringSort, bool skipHeadingExtenders)
1099 bool consumed, dummy;
1100 int ret = CompareInternal (s, start, length,
1101 target, 0, target.Length, stringSort,
1102 out consumed, out dummy, skipHeadingExtenders);
1108 public bool IsSuffix (string src, string target, CompareOptions opt)
1110 return IsSuffix (src, target, src.Length - 1, src.Length, opt);
1113 public bool IsSuffix (string s, string target, int start, int length, CompareOptions opt)
1115 // quick check : simple codepoint comparison
1116 if (s.Length >= target.Length) {
1118 int se = start - length;
1119 for (int i = target.Length - 1; si >= se && i >= 0; i--, si--)
1120 if (s [si] != target [i])
1122 if (si == start + target.Length)
1127 return IsSuffix (s, target, start, length,
1128 (opt & CompareOptions.StringSort) != 0);
1131 bool IsSuffix (string s, string t, int start, int length,
1135 for (;tstart < t.Length; tstart++)
1136 if (!IsIgnorable (t [tstart]))
1138 if (tstart == t.Length)
1139 return true; // as if target is String.Empty.
1142 // This is still not working. If it is not likely to get working, then just remove it.
1144 int send = start - length;
1145 int ti = t.Length - 1;
1148 int sStep = start + 1;
1149 int tStep = t.Length;
1150 bool sourceConsumed, targetConsumed;
1152 for (; send < si; si--)
1153 if (!IsIgnorable (s [si]))
1155 for (; tend < ti; ti--)
1156 if (!IsIgnorable (t [ti]))
1160 for (; send < si; si--)
1161 if (IsSafe (s [si]))
1163 for (; tend < ti; ti--)
1164 if (IsSafe (t [ti]))
1166 Console.WriteLine ("==== {0} {1} {2} {3} {4} {5} {6} {7} {8}", s, si, send, length, t, ti, tstart, sStep - si, tStep - ti);
1167 if (CompareInternal (s, si, sStep - si,
1168 t, ti, tStep - ti, stringSort,
1169 out targetConsumed, out sourceConsumed,
1181 // FIXME: it is not efficient for very long target.
1182 bool sourceConsumed, targetConsumed;
1183 int mismatchCount = 0;
1184 for (int i = 0; i < length; i++) {
1185 escape1.Source = escape2.Source = null;
1186 previousSortKey = previousSortKey2 = null;
1187 previousChar = previousChar2 = -1;
1189 int ret = CompareInternal (s, start - i, i + 1,
1190 t, tstart, t.Length - tstart,
1191 stringSort, out targetConsumed,
1192 out sourceConsumed, true);
1195 if (!sourceConsumed && targetConsumed)
1197 if (!targetConsumed) {
1199 if (mismatchCount > Uni.MaxExpansionLength)
1200 // The largest length of an
1201 // expansion is 3, so if the
1202 // target was not consumed more
1203 // than 3 times, then the tail
1204 // character does not match.
1214 #region IndexOf() / LastIndexOf()
1216 public int IndexOf (string s, string target, CompareOptions opt)
1218 return IndexOf (s, target, 0, s.Length, opt);
1221 public int IndexOf (string s, string target, int start, int length, CompareOptions opt)
1224 return IndexOf (s, target, start, length,
1225 (opt & CompareOptions.StringSort) != 0);
1228 public int IndexOf (string s, char target, CompareOptions opt)
1230 return IndexOf (s, target, 0, s.Length, opt);
1233 public int IndexOf (string s, char target, int start, int length, CompareOptions opt)
1237 // If target is contraction, then use string search.
1238 Contraction ct = GetContraction (target);
1240 if (ct.Replacement != null)
1241 return IndexOf (s, ct.Replacement, start, length,
1242 (opt & CompareOptions.StringSort) != 0);
1244 return IndexOfSortKey (s, start, length, ct.SortKey, char.MinValue, -1, true);
1246 int ti = FilterOptions ((int) target);
1247 charSortKeyIndexTarget [0] = Category (ti);
1248 charSortKeyIndexTarget [1] = Level1 (ti);
1249 if (!ignoreNonSpace)
1250 charSortKeyIndexTarget [2] =
1251 Level2 (ti, ExtenderType.None);
1252 charSortKeyIndexTarget [3] = Uni.Level3 (ti);
1253 return IndexOfSortKey (s, start, length,
1254 charSortKeyIndexTarget, target, ti,
1255 !Uni.HasSpecialWeight ((char) ti));
1259 // Searches target byte[] keydata
1260 int IndexOfSortKey (string s, int start, int length, byte [] sortkey, char target, int ti, bool noLv4)
1262 int end = start + length;
1266 if (MatchesForward (s, ref idx, end, ti, sortkey, noLv4))
1272 // Searches string. Search head character (or keydata when
1273 // the head is contraction sortkey) and try IsPrefix().
1274 int IndexOf (string s, string target, int start, int length, bool stringSort)
1277 for (; tidx < target.Length; tidx++)
1278 if (!IsIgnorable (target [tidx]))
1280 if (tidx == target.Length)
1282 Contraction ct = GetContraction (target, tidx, target.Length - tidx);
1283 string replace = ct != null ? ct.Replacement : null;
1284 byte [] sk = replace == null ? charSortKeyIndexTarget : null;
1286 char tc = char.MinValue;
1288 if (ct != null && sk != null) {
1289 for (int i = 0; i < ct.SortKey.Length; i++)
1290 sk [i] = ct.SortKey [i];
1291 } else if (sk != null) {
1293 ti = FilterOptions (target [tidx]);
1294 sk [0] = Category (ti);
1295 sk [1] = Level1 (ti);
1296 if (!ignoreNonSpace)
1297 sk [2] = Level2 (ti, ExtenderType.None);
1298 sk [3] = Uni.Level3 (ti);
1299 noLv4 = !Uni.HasSpecialWeight ((char) ti);
1302 for (tidx++; tidx < target.Length; tidx++) {
1303 if (Category (target [tidx]) != 1)
1307 sk [2] = (byte) (sk [2] + Level2 (target [tidx], ExtenderType.None));
1313 if (replace != null)
1314 idx = IndexOf (s, replace, start, length, stringSort);
1316 idx = IndexOfSortKey (s, start, length, sk, tc, ti, noLv4);
1319 length -= idx - start;
1321 if (IsPrefix (s, target, start, length, stringSort, false))
1323 Contraction cts = GetContraction (s, start, length);
1325 start += cts.Source.Length;
1326 length -= cts.Source.Length;
1331 } while (length > 0);
1336 // There are the same number of IndexOf() related methods,
1337 // with the same functionalities for each.
1340 public int LastIndexOf (string s, string target, CompareOptions opt)
1342 return LastIndexOf (s, target, s.Length - 1, s.Length, opt);
1345 public int LastIndexOf (string s, string target, int start, int length, CompareOptions opt)
1348 return LastIndexOf (s, target, start, length,
1349 (opt & CompareOptions.StringSort) != 0);
1352 public int LastIndexOf (string s, char target, CompareOptions opt)
1354 return LastIndexOf (s, target, s.Length - 1, s.Length, opt);
1357 public int LastIndexOf (string s, char target, int start, int length, CompareOptions opt)
1361 // If target is contraction, then use string search.
1362 Contraction ct = GetContraction (target);
1364 if (ct.Replacement != null)
1365 return LastIndexOf (s, ct.Replacement, start, length, (opt & CompareOptions.StringSort) != 0);
1367 return LastIndexOfSortKey (s, start, start, length, ct.SortKey, char.MinValue, -1, true);
1370 int ti = FilterOptions ((int) target);
1371 charSortKeyIndexTarget [0] = Category (ti);
1372 charSortKeyIndexTarget [1] = Level1 (ti);
1373 if (!ignoreNonSpace)
1374 charSortKeyIndexTarget [2] = Level2 (ti, ExtenderType.None);
1375 charSortKeyIndexTarget [3] = Uni.Level3 (ti);
1376 return LastIndexOfSortKey (s, start, start, length, charSortKeyIndexTarget, target, ti, !Uni.HasSpecialWeight ((char) ti));
1380 // Searches target byte[] keydata
1381 int LastIndexOfSortKey (string s, int start, int orgStart, int length, byte [] sortkey, char target, int ti, bool noLv4)
1383 int end = start - length;
1387 if (MatchesBackward (s, ref idx, end, orgStart, ti, sortkey, noLv4))
1393 // Searches string. Search head character (or keydata when
1394 // the head is contraction sortkey) and try IsPrefix().
1395 int LastIndexOf (string s, string target, int start, int length, bool stringSort)
1397 int orgStart = start;
1399 for (; tidx < target.Length; tidx++)
1400 if (!IsIgnorable (target [tidx]))
1402 if (tidx == target.Length)
1404 Contraction ct = GetContraction (target, tidx, target.Length - tidx);
1405 string replace = ct != null ? ct.Replacement : null;
1406 byte [] sk = replace == null ? charSortKeyIndexTarget : null;
1409 char tc = char.MinValue;
1411 if (ct != null && sk != null) {
1412 for (int i = 0; i < ct.SortKey.Length; i++)
1413 sk [i] = ct.SortKey [i];
1414 } else if (sk != null) {
1416 ti = FilterOptions (target [tidx]);
1417 sk [0] = Category (ti);
1418 sk [1] = Level1 (ti);
1419 if (!ignoreNonSpace)
1420 sk [2] = Level2 (ti, ExtenderType.None);
1421 sk [3] = Uni.Level3 (ti);
1422 noLv4 = !Uni.HasSpecialWeight ((char) ti);
1425 for (tidx++; tidx < target.Length; tidx++) {
1426 if (Category (target [tidx]) != 1)
1430 sk [2] = (byte) (sk [2] + Level2 (target [tidx], ExtenderType.None));
1437 if (replace != null)
1438 idx = LastIndexOf (s, replace, start, length, stringSort);
1440 idx = LastIndexOfSortKey (s, start, orgStart, length, sk, tc, ti, noLv4);
1443 length -= start - idx;
1446 if (IsPrefix (s, target, idx, orgStart - idx + 1, stringSort, false)) {
1447 for (;idx < orgStart; idx++)
1448 if (!IsIgnorable (s [idx]))
1452 Contraction cts = GetContraction (s, idx, orgStart - idx + 1);
1454 start -= cts.Source.Length;
1455 length -= cts.Source.Length;
1460 } while (length > 0);
1464 private bool MatchesForward (string s, ref int idx, int end, int ti, byte [] sortkey, bool noLv4)
1467 ExtenderType ext = GetExtenderType (s [idx]);
1468 Contraction ct = null;
1469 if (ext == ExtenderType.None)
1470 ct = GetContraction (s, idx, end);
1471 else if (previousChar < 0) {
1472 if (previousSortKey == null) {
1476 charSortKey = previousSortKey;
1479 si = FilterExtender (previousChar, ext);
1480 // if lv4 exists, it never matches contraction
1482 idx += ct.Source.Length;
1485 if (ct.SortKey != null) {
1486 for (int i = 0; i < sortkey.Length; i++)
1487 charSortKey [i] = sortkey [i];
1489 previousSortKey = charSortKey;
1491 // Here is the core of LAMESPEC
1492 // described at the top of the source.
1494 return MatchesForward (ct.Replacement, ref dummy,
1495 ct.Replacement.Length, ti, sortkey, noLv4);
1499 si = FilterOptions (s [idx]);
1501 charSortKey [0] = Category (si);
1502 bool noMatch = false;
1503 if (sortkey [0] == charSortKey [0])
1504 charSortKey [1] = Level1 (si);
1507 if (!ignoreNonSpace && sortkey [1] == charSortKey [1])
1508 charSortKey [2] = Level2 (si, ext);
1509 else if (!ignoreNonSpace)
1512 for (; idx < end; idx++) {
1513 if (Category (s [idx]) != 1)
1518 charSortKey [3] = Uni.Level3 (si);
1519 if (charSortKey [0] != 1)
1522 for (; idx < end; idx++) {
1523 if (Category (s [idx]) != 1)
1527 if (charSortKey [2] == 0)
1528 charSortKey [2] = 2;
1529 charSortKey [2] = (byte) (charSortKey [2]
1530 + Level2 (s [idx], ExtenderType.None));
1533 return MatchesPrimitive (charSortKey, si, ext, sortkey, ti, noLv4);
1536 private bool MatchesPrimitive (byte [] charSortKey, int si, ExtenderType ext, byte [] sortkey, int ti, bool noLv4)
1538 if (charSortKey [0] != sortkey [0] ||
1539 charSortKey [1] != sortkey [1] ||
1540 (!ignoreNonSpace && charSortKey [2] != sortkey [2]) ||
1541 charSortKey [3] != sortkey [3])
1543 if (noLv4 && (si < 0 || !Uni.HasSpecialWeight ((char) si)))
1547 // Since target can never be an extender, if the source
1548 // is an expander and it matters, then they never match.
1549 if (!ignoreNonSpace && ext == ExtenderType.Conditional)
1551 if (Uni.IsJapaneseSmallLetter ((char) si) !=
1552 Uni.IsJapaneseSmallLetter ((char) ti) ||
1553 ToDashTypeValue (ext) !=
1554 // FIXME: we will have to specify correct value for target
1555 ToDashTypeValue (ExtenderType.None) ||
1556 !Uni.IsHiragana ((char) si) !=
1557 !Uni.IsHiragana ((char) ti) ||
1558 IsHalfKana ((char) si) !=
1559 IsHalfKana ((char) ti))
1564 private bool MatchesBackward (string s, ref int idx, int end, int orgStart, int ti, byte [] sortkey, bool noLv4)
1568 ExtenderType ext = GetExtenderType (s [idx]);
1569 // To handle extenders in source, we need to
1570 // check next _primary_ character.
1571 if (ext != ExtenderType.None) {
1572 byte diacritical = 0;
1573 for (int tmp = 0; ; tmp--) {
1574 if (tmp < 0) // heading extender
1576 if (IsIgnorable (s [tmp]))
1578 int tmpi = FilterOptions (s [tmp]);
1579 byte category = Category (tmpi);
1580 if (category == 1) {
1581 diacritical = Level2 (tmpi, ExtenderType.None);
1584 si = FilterExtender (tmpi, ext);
1585 charSortKey [0] = category;
1586 charSortKey [1] = Level1 (si);
1587 if (!ignoreNonSpace)
1588 charSortKey [2] = Level2 (si, ext);
1589 charSortKey [3] = Uni.Level3 (si);
1590 if (ext != ExtenderType.Conditional &&
1593 (charSortKey [2] == 0) ?
1594 (byte) (diacritical + 2) :
1600 Contraction ct = null;
1601 if (ext == ExtenderType.None)
1602 ct = GetContraction (s, idx, end);
1603 // if lv4 exists, it never matches contraction
1605 idx -= ct.Source.Length;
1608 if (ct.SortKey != null) {
1609 for (int i = 0; i < sortkey.Length; i++)
1610 charSortKey [i] = sortkey [i];
1612 previousSortKey = charSortKey;
1614 // Here is the core of LAMESPEC
1615 // described at the top of the source.
1616 int dummy = ct.Replacement.Length - 1;
1617 return MatchesBackward (ct.Replacement,
1618 ref dummy, dummy, -1, ti, sortkey, noLv4);
1620 } else if (ext == ExtenderType.None) {
1622 si = FilterOptions (s [idx]);
1624 bool noMatch = false;
1625 charSortKey [0] = Category (si);
1626 if (charSortKey [0] == sortkey [0])
1627 charSortKey [1] = Level1 (si);
1630 if (!ignoreNonSpace && charSortKey [1] == sortkey [1])
1631 charSortKey [2] = Level2 (si, ext);
1632 else if (!ignoreNonSpace)
1636 charSortKey [3] = Uni.Level3 (si);
1637 if (charSortKey [0] != 1)
1640 if (ext == ExtenderType.None) {
1641 for (int tmp = cur + 1; tmp < orgStart; tmp++) {
1642 if (Category (s [tmp]) != 1)
1646 if (charSortKey [2] == 0)
1647 charSortKey [2] = 2;
1648 charSortKey [2] = (byte) (charSortKey [2]
1649 + Level2 (s [tmp], ExtenderType.None));
1652 return MatchesPrimitive (charSortKey, si, ext, sortkey, ti, noLv4);