4 // This class will demonstrate CompareInfo functionality that will just work.
8 // Here's a summary for supporting contractions, expansions and diacritical
11 // Diacritical mapping is a simple tailoring rule that "remaps" diacritical
12 // weight value from one to another. For now it applies to all range of
13 // characters, but at some stage we might need to limit the remapping ranges.
15 // A Contraction consists of a string (actually char[]) and a sortkey entry
16 // (i.e. byte[]). It indicates that a sequence of characters is interpreted
17 // as a single character that has the mapped sortkey value. There is no
18 // character which goes across "special rules". When the collator encountered
19 // such a sequence of characters, it just returns the sortkey value without
20 // further replacement.
22 // Since it is still likely to happen that a contraction sequence matches
23 // other character than the identical sequence (depending on CompareOptions
24 // and of course, defined sortkey value itself), comparison cannot be done
25 // at source char[] level.
27 // In IndexOf(), the first character in the target string or the target char
28 // itself is turned into sortkey bytes. If the character has a contraction and
29 // that is sortkey map, then it is used instead. If the contraction exists and
30 // that is replacement map, then the first character of the replacement string
31 // is searched instead. IndexOf() always searches only for the top character,
32 // and if it returned negative value, then it returns -1. Otherwise, it then
33 // tries IsPrefix() from that location. If it returns true, then it returns
36 // LAMESPEC: IndexOf() is lame as a whole API. It never matches in the middle
37 // of expansion and there is no proper way to return such indexes within
38 // a single int return value.
40 // For example, try below in .NET:
41 // IndexOf("\u00E6", "a")
42 // IndexOf("\u00E6", "e")
47 using System.Collections;
48 using System.Globalization;
50 using Uni = Mono.Globalization.Unicode.MSCompatUnicodeTable;
51 using UUtil = Mono.Globalization.Unicode.MSCompatUnicodeTableUtil;
53 namespace Mono.Globalization.Unicode
55 internal class SimpleCollator
57 static SimpleCollator invariant =
58 new SimpleCollator (CultureInfo.InvariantCulture);
61 // CompareOptions expanded.
62 bool ignoreNonSpace; // used in IndexOf()
67 TextInfo textInfo; // for ToLower().
69 unsafe readonly byte* cjkCatTable;
70 unsafe readonly byte* cjkLv1Table;
71 readonly CodePointIndexer cjkIndexer;
72 unsafe readonly byte* cjkLv2Table;
73 readonly CodePointIndexer cjkLv2Indexer;
75 readonly Contraction [] contractions;
76 readonly Level2Map [] level2Maps;
78 // This flag marks characters as "unsafe", where the character
79 // could be used as part of a contraction (whose length > 1).
80 readonly byte [] unsafeFlags;
82 const int UnsafeFlagLength = 0x300 / 8;
84 // temporary sortkey buffer for index search/comparison
85 byte [] charSortKey = new byte [4];
86 byte [] charSortKey2 = new byte [4];
87 byte [] charSortKeyIndexTarget = new byte [4];
89 #region .ctor() and split functions
91 public SimpleCollator (CultureInfo culture)
94 textInfo = culture.TextInfo;
95 buf = new SortKeyBuffer (culture.LCID);
98 SetCJKTable (culture, ref cjkIndexer,
99 ref cjkCatTable, ref cjkLv1Table,
100 ref cjkLv2Indexer, ref cjkLv2Table);
103 // Get tailoring info
104 TailoringInfo t = null;
105 for (CultureInfo ci = culture; ci.LCID != 127; ci = ci.Parent) {
106 t = Uni.GetTailoringInfo (ci.LCID);
110 if (t == null) // then use invariant
111 t = Uni.GetTailoringInfo (127);
113 frenchSort = t.FrenchSort;
114 Uni.BuildTailoringTables (culture, t, ref contractions,
116 unsafeFlags = new byte [UnsafeFlagLength];
117 foreach (Contraction c in contractions)
118 if (c.Source.Length > 1)
119 foreach (char ch in c.Source)
120 unsafeFlags [(int) ch / 8 ]
121 |= (byte) ((int) ch % 8);
123 foreach (Contraction c in invariant.contractions)
124 if (c.Source.Length > 1)
125 foreach (char ch in c.Source)
126 unsafeFlags [(int) ch / 8 ]
127 |= (byte) ((int) ch % 8);
129 // FIXME: Since tailorings are mostly for latin
130 // (and in some cases Cyrillic) characters, it would
131 // be much better for performance to store "start
132 // indexes" for > 370 (culture-specific letters).
135 // dump tailoring table
136 Console.WriteLine ("******** building table for {0} : c - {1} d - {2}",
137 culture.LCID, contractions.Length, level2Maps.Length);
138 foreach (Contraction c in contractions) {
139 foreach (char cc in c.Source)
140 Console.Write ("{0:X4} ", (int) cc);
141 Console.WriteLine (" -> '{0}'", c.Replacement);
146 unsafe private void SetCJKTable (
147 CultureInfo culture, ref CodePointIndexer cjkIndexer,
148 ref byte* catTable, ref byte* lv1Table,
149 ref CodePointIndexer lv2Indexer, ref byte* lv2Table)
151 string name = GetNeutralCulture (culture).Name;
153 Uni.FillCJK (name, ref cjkIndexer, ref catTable,
154 ref lv1Table, ref lv2Indexer, ref lv2Table);
157 static CultureInfo GetNeutralCulture (CultureInfo info)
159 CultureInfo ret = info;
160 while (ret.Parent != null && ret.Parent.LCID != 127)
167 unsafe byte Category (int cp)
169 if (cp < 0x3000 || cjkCatTable == null)
170 return Uni.Category (cp);
171 int idx = cjkIndexer.ToIndex (cp);
172 return idx < 0 ? Uni.Category (cp) : cjkCatTable [idx];
175 unsafe byte Level1 (int cp)
177 if (cp < 0x3000 || cjkLv1Table == null)
178 return Uni.Level1 (cp);
179 int idx = cjkIndexer.ToIndex (cp);
180 return idx < 0 ? Uni.Level1 (cp) : cjkLv1Table [idx];
183 unsafe byte Level2 (int cp, ExtenderType ext)
185 if (ext == ExtenderType.Buggy)
187 else if (ext == ExtenderType.Conditional)
190 if (cp < 0x3000 || cjkLv2Table == null)
191 return Uni.Level2 (cp);
192 int idx = cjkLv2Indexer.ToIndex (cp);
193 byte ret = idx < 0 ? (byte) 0 : cjkLv2Table [idx];
196 ret = Uni.Level2 (cp);
197 if (level2Maps.Length == 0)
199 for (int i = 0; i < level2Maps.Length; i++) {
200 if (level2Maps [i].Source == ret)
201 return level2Maps [i].Replace;
202 else if (level2Maps [i].Source > ret)
208 bool IsHalfKana (int cp)
210 return ignoreWidth || Uni.IsHalfWidthKana ((char) cp);
213 void SetOptions (CompareOptions options)
215 this.ignoreNonSpace = (options & CompareOptions.IgnoreNonSpace) != 0;
216 this.ignoreSymbols = (options & CompareOptions.IgnoreSymbols) != 0;
217 this.ignoreWidth = (options & CompareOptions.IgnoreWidth) != 0;
218 this.ignoreCase = (options & CompareOptions.IgnoreCase) != 0;
219 this.ignoreKanaType = (options & CompareOptions.IgnoreKanaType) != 0;
220 previousChar = previousChar2 = -1;
221 previousSortKey = previousSortKey2 = null;
222 escape1.Source = escape2.Source = null;
225 Contraction GetContraction (string s, int start, int end)
227 Contraction c = GetContraction (s, start, end, contractions);
228 if (c != null || lcid == 127)
230 return GetContraction (s, start, end, invariant.contractions);
233 Contraction GetContraction (string s, int start, int end, Contraction [] clist)
235 for (int i = 0; i < clist.Length; i++) {
236 Contraction ct = clist [i];
237 int diff = ct.Source [0] - s [start];
239 return null; // it's already sorted
242 char [] chars = ct.Source;
243 if (end - start < chars.Length)
246 for (int n = 0; n < chars.Length; n++)
247 if (s [start + n] != chars [n]) {
257 Contraction GetTailContraction (string s, int start, int end)
259 Contraction c = GetTailContraction (s, start, end, contractions);
260 if (c != null || lcid == 127)
262 return GetTailContraction (s, start, end, invariant.contractions);
265 Contraction GetTailContraction (string s, int start, int end, Contraction [] clist)
267 for (int i = 0; i < clist.Length; i++) {
268 Contraction ct = clist [i];
269 int diff = ct.Source [0] - s [end];
271 return null; // it's already sorted
274 char [] chars = ct.Source;
275 if (start - end + 1 < chars.Length)
278 int offset = start - chars.Length + 1;
279 for (int n = 0; n < chars.Length; n++)
280 if (s [offset + n] != chars [n]) {
290 Contraction GetContraction (char c)
292 Contraction ct = GetContraction (c, contractions);
293 if (ct != null || lcid == 127)
295 return GetContraction (c, invariant.contractions);
298 Contraction GetContraction (char c, Contraction [] clist)
300 for (int i = 0; i < clist.Length; i++) {
301 Contraction ct = clist [i];
302 if (ct.Source [0] > c)
303 return null; // it's already sorted
304 if (ct.Source [0] == c && ct.Source.Length == 1)
310 int FilterOptions (int i)
313 int x = Uni.ToWidthCompat (i);
318 i = textInfo.ToLower ((char) i);
320 i = Uni.ToKanaTypeInsensitive (i);
324 int previousChar = -1;
325 byte [] previousSortKey = null;
326 int previousChar2 = -1;
327 byte [] previousSortKey2 = null;
337 ExtenderType GetExtenderType (int i)
339 // LAMESPEC: Windows expects true for U+3005, but
340 // sometimes it does not represent to repeat just
342 // Windows also expects true for U+3031 and U+3032,
343 // but they should *never* repeat one character.
345 // U+2015 becomes an extender only when it is Japanese
347 return lcid == 16 ? ExtenderType.Conditional :
350 if (i < 0x3005 || i > 0xFF70)
351 return ExtenderType.None;
356 return ExtenderType.Simple;
358 return ExtenderType.Conditional;
361 return ExtenderType.Voiced;
365 return ExtenderType.None;
367 case 0x3005: // LAMESPEC: see above.
368 return ExtenderType.Buggy;
369 case 0x3031: // LAMESPEC: see above.
370 case 0x3032: // LAMESPEC: see above.
373 return ExtenderType.Simple;
376 return ExtenderType.Voiced;
378 return ExtenderType.Conditional;
380 return ExtenderType.None;
384 byte ToDashTypeValue (ExtenderType ext)
386 if (ignoreNonSpace) // LAMESPEC: huh, why?
389 case ExtenderType.None:
391 case ExtenderType.Conditional:
398 int FilterExtender (int i, ExtenderType ext)
400 if (ext == ExtenderType.Conditional &&
401 Uni.HasSpecialWeight ((char) i)) {
402 bool half = IsHalfKana ((char) i);
403 bool katakana = !Uni.IsHiragana ((char) i);
404 switch (Level1 (i) & 7) {
406 return half ? 0xFF71 : katakana ? 0x30A2 : 0x3042;
408 return half ? 0xFF72 : katakana ? 0x30A4 : 0x3044;
410 return half ? 0xFF73 : katakana ? 0x30A6 : 0x3046;
412 return half ? 0xFF74 : katakana ? 0x30A8 : 0x3048;
414 return half ? 0xFF75 : katakana ? 0x30AA : 0x304A;
420 bool IsIgnorable (int i)
422 return Uni.IsIgnorable (i) ||
423 ignoreSymbols && Uni.IsIgnorableSymbol (i) ||
424 ignoreNonSpace && Uni.IsIgnorableNonSpacing (i);
429 return i / 8 >= unsafeFlags.Length ? true : (unsafeFlags [i / 8] & (1 << (i % 8))) == 0;
434 public SortKey GetSortKey (string s)
436 return GetSortKey (s, CompareOptions.None);
439 public SortKey GetSortKey (string s, CompareOptions options)
441 return GetSortKey (s, 0, s.Length, options);
444 public SortKey GetSortKey (string s, int start, int length, CompareOptions options)
446 SetOptions (options);
448 buf.Initialize (options, lcid, s, frenchSort);
449 int end = start + length;
450 GetSortKey (s, start, end);
451 return buf.GetResultAndReset ();
454 void GetSortKey (string s, int start, int end)
456 for (int n = start; n < end; n++) {
459 ExtenderType ext = GetExtenderType (i);
460 if (ext != ExtenderType.None) {
461 i = FilterExtender (previousChar, ext);
463 FillSortKeyRaw (i, ext);
464 else if (previousSortKey != null) {
465 byte [] b = previousSortKey;
469 b [2] != 1 ? b [2] : Level2 (i, ext),
470 b [3] != 1 ? b [3] : Uni.Level3 (i));
472 // otherwise do nothing.
473 // (if the extender is the first char
474 // in the string, then just ignore.)
480 i = FilterOptions (i);
482 Contraction ct = GetContraction (s, n, end);
484 if (ct.Replacement != null) {
485 GetSortKey (ct.Replacement, 0, ct.Replacement.Length);
487 byte [] b = ct.SortKey;
491 b [2] != 1 ? b [2] : Level2 (i, ext),
492 b [3] != 1 ? b [3] : Uni.Level3 (i));
496 n += ct.Source.Length - 1;
499 if (!Uni.IsIgnorableNonSpacing (i))
501 FillSortKeyRaw (i, ExtenderType.None);
506 void FillSortKeyRaw (int i, ExtenderType ext)
508 if (0x3400 <= i && i <= 0x4DB5) {
509 int diff = i - 0x3400;
510 buf.AppendCJKExtension (
511 (byte) (0x10 + diff / 254),
512 (byte) (diff % 254 + 2));
516 UnicodeCategory uc = char.GetUnicodeCategory ((char) i);
518 case UnicodeCategory.PrivateUse:
519 int diff = i - 0xE000;
521 (byte) (0xE5 + diff / 254),
522 (byte) (diff % 254 + 2),
526 case UnicodeCategory.Surrogate:
527 FillSurrogateSortKeyRaw (i);
531 byte level2 = Level2 (i, ext);
532 if (Uni.HasSpecialWeight ((char) i)) {
533 byte level1 = Level1 (i);
539 Uni.IsJapaneseSmallLetter ((char) i),
540 ToDashTypeValue (ext),
541 !Uni.IsHiragana ((char) i),
542 IsHalfKana ((char) i)
544 if (!ignoreNonSpace && ext == ExtenderType.Voiced)
545 // Append voice weight
546 buf.AppendNormal (1, 1, 1, 0);
556 void FillSurrogateSortKeyRaw (int i)
565 lower = (byte) ((i == 0xD800) ? 0x3E : 0x3F);
566 } else if (0xD840 <= i && i < 0xD880) {
570 } else if (0xDB80 <= i && i < 0xDC00) {
571 diffbase = 0xDB80 - 0x40;
575 diffbase = 0xDC00 - 0xF8 + 2;
579 int diff = i - diffbase;
582 (byte) (segment + diff / 254),
583 (byte) (diff % 254 + 2),
592 public int Compare (string s1, string s2)
594 return Compare (s1, s2, CompareOptions.None);
597 public int Compare (string s1, string s2, CompareOptions options)
599 return Compare (s1, 0, s1.Length, s2, 0, s2.Length, options);
604 public string Source;
611 // Those instances are reused not to invoke instantiation
613 Escape escape1 = new Escape ();
614 Escape escape2 = new Escape ();
616 private int CompareOrdinal (string s1, int idx1, int len1,
617 string s2, int idx2, int len2)
619 int min = len1 < len2 ? len1 : len2;
620 int end1 = idx1 + min;
621 int end2 = idx2 + min;
622 for (int i1 = idx1, i2 = idx2;
623 i1 < end1 && i2 < end2; i1++, i2++)
624 if (s1 [i1] != s2 [i2])
625 return s1 [i1] - s2 [i2];
626 return len1 == len2 ? 0 :
627 len1 == min ? - 1 : 1;
630 public int Compare (string s1, int idx1, int len1,
631 string s2, int idx2, int len2, CompareOptions options)
633 // quick equality check
634 if (idx1 == idx2 && len1 == len2 &&
635 Object.ReferenceEquals (s1, s2))
637 // FIXME: this should be done inside Compare() at
639 // int ord = CompareOrdinal (s1, idx1, len1, s2, idx2, len2);
642 if (options == CompareOptions.Ordinal)
643 return CompareOrdinal (s1, idx1, len1, s2, idx2, len2);
645 #if false // stable easy version, depends on GetSortKey().
646 SortKey sk1 = GetSortKey (s1, idx1, len1, options);
647 SortKey sk2 = GetSortKey (s2, idx2, len2, options);
648 byte [] d1 = sk1.KeyData;
649 byte [] d2 = sk2.KeyData;
650 int len = d1.Length > d2.Length ? d2.Length : d1.Length;
651 for (int i = 0; i < len; i++)
652 if (d1 [i] != d2 [i])
653 return d1 [i] < d2 [i] ? -1 : 1;
654 return d1.Length == d2.Length ? 0 : d1.Length < d2.Length ? -1 : 1;
656 SetOptions (options);
658 int ret = CompareInternal (s1, idx1, len1, s2, idx2, len2, (options & CompareOptions.StringSort) != 0, out dummy, out dummy2, true);
659 return ret == 0 ? 0 : ret < 0 ? -1 : 1;
663 int CompareInternal (string s1, int idx1, int len1, string s2,
664 int idx2, int len2, bool stringSort,
665 out bool targetConsumed, out bool sourceConsumed,
666 bool skipHeadingExtenders)
670 int end1 = idx1 + len1;
671 int end2 = idx2 + len2;
672 targetConsumed = false;
673 sourceConsumed = false;
675 // It holds final result that comes from the comparison
676 // at level 2 or lower. Even if Compare() found the
677 // difference at level 2 or lower, it still has to
678 // continue level 1 comparison. FinalResult is used
679 // when there was no further differences.
681 // It holds the comparison level to do. It starts from
682 // 5, and becomes 1 when we do primary-only comparison.
683 int currentLevel = 5;
690 // Skip heading extenders
691 if (skipHeadingExtenders) {
692 for (; idx1 < end1; idx1++)
693 if (GetExtenderType (s1 [idx1]) == ExtenderType.None)
695 for (; idx2 < end2; idx2++)
696 if (GetExtenderType (s2 [idx2]) == ExtenderType.None)
700 ExtenderType ext1 = ExtenderType.None;
701 ExtenderType ext2 = ExtenderType.None;
703 int quickCheckPos1 = idx1;
704 int quickCheckPos2 = idx2;
707 for (; idx1 < end1; idx1++)
708 if (!IsIgnorable (s1 [idx1]))
710 for (; idx2 < end2; idx2++)
711 if (!IsIgnorable (s2 [idx2]))
715 if (escape1.Source == null)
718 start1 = escape1.Start;
719 idx1 = escape1.Index;
721 quickCheckPos1 = escape1.Optional;
722 escape1.Source = null;
726 if (escape2.Source == null)
729 start2 = escape2.Start;
730 idx2 = escape2.Index;
732 quickCheckPos2 = escape2.Optional;
733 escape2.Source = null;
737 // If comparison is unstable, then this part is one of the most doubtful part.
738 // Here we run quick codepoint comparison and run back to "stable" area to
739 // compare characters.
741 // Strictly to say, even the first character
742 // could be compared here, but it messed
743 // backward step, so I just avoided mess.
744 if (quickCheckPos1 < idx1 && quickCheckPos2 < idx2) {
745 while (idx1 < end1 && idx2 < end2 &&
746 s1 [idx1] == s2 [idx2]) {
750 if (idx1 == end1 || idx2 == end2)
751 continue; // check replacement
753 int backwardEnd1 = quickCheckPos1;
754 int backwardEnd2 = quickCheckPos2;
755 quickCheckPos1 = idx1;
756 quickCheckPos2 = idx2;
761 for (;idx1 > backwardEnd1; idx1--)
762 if (Category (s1 [idx1]) != 1)
764 for (;idx2 > backwardEnd2; idx2--)
765 if (Category (s2 [idx2]) != 1)
767 for (;idx1 > backwardEnd1; idx1--)
768 if (IsSafe (s1 [idx1]))
770 for (;idx2 > backwardEnd2; idx2--)
771 if (IsSafe (s2 [idx2]))
780 int i1 = FilterOptions (s1 [idx1]);
781 int i2 = FilterOptions (s2 [idx2]);
782 bool special1 = false;
783 bool special2 = false;
785 // If current character is an expander, then
786 // repeat the previous character.
787 ext1 = GetExtenderType (i1);
788 if (ext1 != ExtenderType.None) {
789 if (previousChar < 0) {
790 if (previousSortKey == null) {
795 sk1 = previousSortKey;
798 i1 = FilterExtender (previousChar, ext1);
800 ext2 = GetExtenderType (i2);
801 if (ext2 != ExtenderType.None) {
802 if (previousChar2 < 0) {
803 if (previousSortKey2 == null) {
808 sk2 = previousSortKey2;
811 i2 = FilterExtender (previousChar2, ext2);
814 byte cat1 = Category (i1);
815 byte cat2 = Category (i2);
817 // Handle special weight characters
818 if (!stringSort && currentLevel > 4) {
820 lv5At1 = escape1.Source != null ?
821 escape1.Index - escape1.Start :
823 // here Windows has a bug that it does
824 // not consider thirtiary weight.
825 lv5Value1 = Level1 (i1) << 8 + Uni.Level3 (i1);
830 lv5At2 = escape2.Source != null ?
831 escape2.Index - escape2.Start :
833 // here Windows has a bug that it does
834 // not consider thirtiary weight.
835 lv5Value2 = Level1 (i2) << 8 + Uni.Level3 (i2);
839 if (cat1 == 6 || cat2 == 6) {
845 Contraction ct1 = null;
846 if (ext1 == ExtenderType.None)
847 ct1 = GetContraction (s1, idx1, end1);
852 else if (ct1 != null) {
853 offset1 = ct1.Source.Length;
854 if (ct1.SortKey != null) {
856 for (int i = 0; i < ct1.SortKey.Length; i++)
857 sk1 [i] = ct1.SortKey [i];
859 previousSortKey = sk1;
861 else if (escape1.Source == null) {
863 escape1.Start = start1;
864 escape1.Index = cur1 + ct1.Source.Length;
866 escape1.Optional = quickCheckPos1;
867 s1 = ct1.Replacement;
878 sk1 [1] = Level1 (i1);
879 if (!ignoreNonSpace && currentLevel > 1)
880 sk1 [2] = Level2 (i1, ext1);
881 if (currentLevel > 2)
882 sk1 [3] = Uni.Level3 (i1);
883 if (currentLevel > 3)
884 special1 = Uni.HasSpecialWeight ((char) i1);
889 Contraction ct2 = null;
890 if (ext2 == ExtenderType.None)
891 ct2 = GetContraction (s2, idx2, end2);
895 else if (ct2 != null) {
896 idx2 += ct2.Source.Length;
897 if (ct2.SortKey != null) {
899 for (int i = 0; i < ct2.SortKey.Length; i++)
900 sk2 [i] = ct2.SortKey [i];
902 previousSortKey2 = sk2;
904 else if (escape2.Source == null) {
906 escape2.Start = start2;
907 escape2.Index = cur2 + ct2.Source.Length;
909 escape2.Optional = quickCheckPos2;
910 s2 = ct2.Replacement;
921 sk2 [1] = Level1 (i2);
922 if (!ignoreNonSpace && currentLevel > 1)
923 sk2 [2] = Level2 (i2, ext2);
924 if (currentLevel > 2)
925 sk2 [3] = Uni.Level3 (i2);
926 if (currentLevel > 3)
927 special2 = Uni.HasSpecialWeight ((char) i2);
933 // Add offset here so that it does not skip
934 // idx1 while s2 has a replacement.
937 // add diacritical marks in s1 here
938 if (!ignoreNonSpace) {
939 while (idx1 < end1) {
940 if (Category (s1 [idx1]) != 1)
944 sk1 [2] = (byte) (sk1 [2] +
945 Level2 (s1 [idx1], ExtenderType.None));
949 // add diacritical marks in s2 here
950 while (idx2 < end2) {
951 if (Category (s2 [idx2]) != 1)
955 sk2 [2] = (byte) (sk2 [2] +
956 Level2 (s2 [idx2], ExtenderType.None));
961 int ret = sk1 [0] - sk2 [0];
962 ret = ret != 0 ? ret : sk1 [1] - sk2 [1];
965 if (currentLevel == 1)
967 if (!ignoreNonSpace) {
968 ret = sk1 [2] - sk2 [2];
971 currentLevel = frenchSort ? 2 : 1;
975 if (currentLevel == 2)
977 ret = sk1 [3] - sk2 [3];
983 if (currentLevel == 3)
985 if (special1 != special2) {
986 finalResult = special1 ? 1 : -1;
991 ret = CompareFlagPair (
992 !Uni.IsJapaneseSmallLetter ((char) i1),
993 !Uni.IsJapaneseSmallLetter ((char) i2));
994 ret = ret != 0 ? ret :
995 ToDashTypeValue (ext1) -
996 ToDashTypeValue (ext2);
997 ret = ret != 0 ? ret : CompareFlagPair (
998 Uni.IsHiragana ((char) i1),
999 Uni.IsHiragana ((char) i2));
1000 ret = ret != 0 ? ret : CompareFlagPair (
1001 !IsHalfKana ((char) i1),
1002 !IsHalfKana ((char) i2));
1011 // If there were only level 3 or lower differences,
1012 // then we still have to find diacritical differences
1014 if (!ignoreNonSpace &&
1015 finalResult != 0 && currentLevel > 2) {
1016 while (idx1 < end1 && idx2 < end2) {
1017 if (!Uni.IsIgnorableNonSpacing (s1 [idx1]))
1019 if (!Uni.IsIgnorableNonSpacing (s2 [idx2]))
1021 finalResult = Level2 (FilterOptions ((s1 [idx1])), ext1) - Level2 (FilterOptions (s2 [idx2]), ext2);
1022 if (finalResult != 0)
1026 // they should work only for the first character
1027 ext1 = ExtenderType.None;
1028 ext2 = ExtenderType.None;
1031 if (currentLevel == 1 && finalResult != 0) {
1033 if (Uni.IsIgnorableNonSpacing (s1 [idx1]))
1036 if (Uni.IsIgnorableNonSpacing (s2 [idx2]))
1039 // we still have to handle level 5
1040 if (finalResult == 0) {
1041 finalResult = lv5At1 - lv5At2;
1042 if (finalResult == 0)
1043 finalResult = lv5Value1 - lv5Value2;
1045 if (finalResult == 0) {
1047 targetConsumed = true;
1049 sourceConsumed = true;
1051 return idx1 != end1 ? 1 : idx2 == end2 ? finalResult : -1;
1054 int CompareFlagPair (bool b1, bool b2)
1056 return b1 == b2 ? 0 : b1 ? 1 : -1;
1061 #region IsPrefix() and IsSuffix()
1063 public bool IsPrefix (string src, string target, CompareOptions opt)
1065 return IsPrefix (src, target, 0, src.Length, opt);
1068 public bool IsPrefix (string s, string target, int start, int length, CompareOptions opt)
1071 return IsPrefix (s, target, start, length,
1072 (opt & CompareOptions.StringSort) != 0, true);
1075 public bool IsPrefix (string s, string target, int start, int length, bool stringSort, bool skipHeadingExtenders)
1077 bool consumed, dummy;
1078 int ret = CompareInternal (s, start, length,
1079 target, 0, target.Length, stringSort,
1080 out consumed, out dummy, skipHeadingExtenders);
1086 public bool IsSuffix (string src, string target, CompareOptions opt)
1088 return IsSuffix (src, target, src.Length - 1, src.Length, opt);
1091 public bool IsSuffix (string s, string target, int start, int length, CompareOptions opt)
1093 // quick check : simple codepoint comparison
1094 if (s.Length >= target.Length) {
1096 int se = start - length;
1097 for (int i = target.Length - 1; si >= se && i >= 0; i--, si--)
1098 if (s [si] != target [i])
1100 if (si == start + target.Length)
1105 return IsSuffix (s, target, start, length,
1106 (opt & CompareOptions.StringSort) != 0);
1109 bool IsSuffix (string s, string t, int start, int length,
1113 for (;tstart < t.Length; tstart++)
1114 if (!IsIgnorable (t [tstart]))
1116 if (tstart == t.Length)
1117 return true; // as if target is String.Empty.
1120 // This is still not working. If it is not likely to get working, then just remove it.
1122 int send = start - length;
1123 int ti = t.Length - 1;
1126 int sStep = start + 1;
1127 int tStep = t.Length;
1128 bool sourceConsumed, targetConsumed;
1130 for (; send < si; si--)
1131 if (!IsIgnorable (s [si]))
1133 for (; tend < ti; ti--)
1134 if (!IsIgnorable (t [ti]))
1138 for (; send < si; si--)
1139 if (IsSafe (s [si]))
1141 for (; tend < ti; ti--)
1142 if (IsSafe (t [ti]))
1144 Console.WriteLine ("==== {0} {1} {2} {3} {4} {5} {6} {7} {8}", s, si, send, length, t, ti, tstart, sStep - si, tStep - ti);
1145 if (CompareInternal (s, si, sStep - si,
1146 t, ti, tStep - ti, stringSort,
1147 out targetConsumed, out sourceConsumed,
1159 // FIXME: it is not efficient for very long target.
1160 bool sourceConsumed, targetConsumed;
1161 int mismatchCount = 0;
1162 for (int i = 0; i < length; i++) {
1163 escape1.Source = escape2.Source = null;
1164 previousSortKey = previousSortKey2 = null;
1165 previousChar = previousChar2 = -1;
1167 int ret = CompareInternal (s, start - i, i + 1,
1168 t, tstart, t.Length - tstart,
1169 stringSort, out targetConsumed,
1170 out sourceConsumed, true);
1173 if (!sourceConsumed && targetConsumed)
1175 if (!targetConsumed) {
1177 if (mismatchCount > Uni.MaxExpansionLength)
1178 // The largest length of an
1179 // expansion is 3, so if the
1180 // target was not consumed more
1181 // than 3 times, then the tail
1182 // character does not match.
1192 #region IndexOf() / LastIndexOf()
1194 public int IndexOf (string s, string target, CompareOptions opt)
1196 return IndexOf (s, target, 0, s.Length, opt);
1199 public int IndexOf (string s, string target, int start, int length, CompareOptions opt)
1202 return IndexOf (s, target, start, length,
1203 (opt & CompareOptions.StringSort) != 0);
1206 public int IndexOf (string s, char target, CompareOptions opt)
1208 return IndexOf (s, target, 0, s.Length, opt);
1211 public int IndexOf (string s, char target, int start, int length, CompareOptions opt)
1215 // If target is contraction, then use string search.
1216 Contraction ct = GetContraction (target);
1218 if (ct.Replacement != null)
1219 return IndexOf (s, ct.Replacement, start, length,
1220 (opt & CompareOptions.StringSort) != 0);
1222 return IndexOfSortKey (s, start, length, ct.SortKey, char.MinValue, -1, true);
1224 int ti = FilterOptions ((int) target);
1225 charSortKeyIndexTarget [0] = Category (ti);
1226 charSortKeyIndexTarget [1] = Level1 (ti);
1227 if (!ignoreNonSpace)
1228 charSortKeyIndexTarget [2] =
1229 Level2 (ti, ExtenderType.None);
1230 charSortKeyIndexTarget [3] = Uni.Level3 (ti);
1231 return IndexOfSortKey (s, start, length,
1232 charSortKeyIndexTarget, target, ti,
1233 !Uni.HasSpecialWeight ((char) ti));
1237 // Searches target byte[] keydata
1238 int IndexOfSortKey (string s, int start, int length, byte [] sortkey, char target, int ti, bool noLv4)
1240 int end = start + length;
1244 if (MatchesForward (s, ref idx, end, ti, sortkey, noLv4))
1250 // Searches string. Search head character (or keydata when
1251 // the head is contraction sortkey) and try IsPrefix().
1252 int IndexOf (string s, string target, int start, int length, bool stringSort)
1255 for (; tidx < target.Length; tidx++)
1256 if (!IsIgnorable (target [tidx]))
1258 if (tidx == target.Length)
1260 Contraction ct = GetContraction (target, tidx, target.Length - tidx);
1261 string replace = ct != null ? ct.Replacement : null;
1262 byte [] sk = replace == null ? charSortKeyIndexTarget : null;
1264 char tc = char.MinValue;
1266 if (ct != null && sk != null) {
1267 for (int i = 0; i < ct.SortKey.Length; i++)
1268 sk [i] = ct.SortKey [i];
1269 } else if (sk != null) {
1271 ti = FilterOptions (target [tidx]);
1272 sk [0] = Category (ti);
1273 sk [1] = Level1 (ti);
1274 if (!ignoreNonSpace)
1275 sk [2] = Level2 (ti, ExtenderType.None);
1276 sk [3] = Uni.Level3 (ti);
1277 noLv4 = !Uni.HasSpecialWeight ((char) ti);
1280 for (tidx++; tidx < target.Length; tidx++) {
1281 if (Category (target [tidx]) != 1)
1285 sk [2] = (byte) (sk [2] + Level2 (target [tidx], ExtenderType.None));
1291 if (replace != null)
1292 idx = IndexOf (s, replace, start, length, stringSort);
1294 idx = IndexOfSortKey (s, start, length, sk, tc, ti, noLv4);
1297 length -= idx - start;
1299 if (IsPrefix (s, target, start, length, stringSort, false))
1301 Contraction cts = GetContraction (s, start, length);
1303 start += cts.Source.Length;
1304 length -= cts.Source.Length;
1309 } while (length > 0);
1314 // There are the same number of IndexOf() related methods,
1315 // with the same functionalities for each.
1318 public int LastIndexOf (string s, string target, CompareOptions opt)
1320 return LastIndexOf (s, target, s.Length - 1, s.Length, opt);
1323 public int LastIndexOf (string s, string target, int start, int length, CompareOptions opt)
1326 return LastIndexOf (s, target, start, length,
1327 (opt & CompareOptions.StringSort) != 0);
1330 public int LastIndexOf (string s, char target, CompareOptions opt)
1332 return LastIndexOf (s, target, s.Length - 1, s.Length, opt);
1335 public int LastIndexOf (string s, char target, int start, int length, CompareOptions opt)
1339 // If target is contraction, then use string search.
1340 Contraction ct = GetContraction (target);
1342 if (ct.Replacement != null)
1343 return LastIndexOf (s, ct.Replacement, start, length, (opt & CompareOptions.StringSort) != 0);
1345 return LastIndexOfSortKey (s, start, start, length, ct.SortKey, char.MinValue, -1, true);
1348 int ti = FilterOptions ((int) target);
1349 charSortKeyIndexTarget [0] = Category (ti);
1350 charSortKeyIndexTarget [1] = Level1 (ti);
1351 if (!ignoreNonSpace)
1352 charSortKeyIndexTarget [2] = Level2 (ti, ExtenderType.None);
1353 charSortKeyIndexTarget [3] = Uni.Level3 (ti);
1354 return LastIndexOfSortKey (s, start, start, length, charSortKeyIndexTarget, target, ti, !Uni.HasSpecialWeight ((char) ti));
1358 // Searches target byte[] keydata
1359 int LastIndexOfSortKey (string s, int start, int orgStart, int length, byte [] sortkey, char target, int ti, bool noLv4)
1361 int end = start - length;
1365 if (MatchesBackward (s, ref idx, end, orgStart, ti, sortkey, noLv4))
1371 // Searches string. Search head character (or keydata when
1372 // the head is contraction sortkey) and try IsPrefix().
1373 int LastIndexOf (string s, string target, int start, int length, bool stringSort)
1375 int orgStart = start;
1377 for (; tidx < target.Length; tidx++)
1378 if (!IsIgnorable (target [tidx]))
1380 if (tidx == target.Length)
1382 Contraction ct = GetContraction (target, tidx, target.Length - tidx);
1383 string replace = ct != null ? ct.Replacement : null;
1384 byte [] sk = replace == null ? charSortKeyIndexTarget : null;
1387 char tc = char.MinValue;
1389 if (ct != null && sk != null) {
1390 for (int i = 0; i < ct.SortKey.Length; i++)
1391 sk [i] = ct.SortKey [i];
1392 } else if (sk != null) {
1394 ti = FilterOptions (target [tidx]);
1395 sk [0] = Category (ti);
1396 sk [1] = Level1 (ti);
1397 if (!ignoreNonSpace)
1398 sk [2] = Level2 (ti, ExtenderType.None);
1399 sk [3] = Uni.Level3 (ti);
1400 noLv4 = !Uni.HasSpecialWeight ((char) ti);
1403 for (tidx++; tidx < target.Length; tidx++) {
1404 if (Category (target [tidx]) != 1)
1408 sk [2] = (byte) (sk [2] + Level2 (target [tidx], ExtenderType.None));
1415 if (replace != null)
1416 idx = LastIndexOf (s, replace, start, length, stringSort);
1418 idx = LastIndexOfSortKey (s, start, orgStart, length, sk, tc, ti, noLv4);
1421 length -= start - idx;
1424 if (IsPrefix (s, target, idx, orgStart - idx + 1, stringSort, false)) {
1425 for (;idx < orgStart; idx++)
1426 if (!IsIgnorable (s [idx]))
1430 Contraction cts = GetContraction (s, idx, orgStart - idx + 1);
1432 start -= cts.Source.Length;
1433 length -= cts.Source.Length;
1438 } while (length > 0);
1442 private bool MatchesForward (string s, ref int idx, int end, int ti, byte [] sortkey, bool noLv4)
1445 ExtenderType ext = GetExtenderType (s [idx]);
1446 Contraction ct = null;
1447 if (ext == ExtenderType.None)
1448 ct = GetContraction (s, idx, end);
1449 else if (previousChar < 0) {
1450 if (previousSortKey == null) {
1454 charSortKey = previousSortKey;
1457 si = FilterExtender (previousChar, ext);
1458 // if lv4 exists, it never matches contraction
1460 idx += ct.Source.Length;
1463 if (ct.SortKey != null) {
1464 for (int i = 0; i < sortkey.Length; i++)
1465 charSortKey [i] = sortkey [i];
1467 previousSortKey = charSortKey;
1469 // Here is the core of LAMESPEC
1470 // described at the top of the source.
1472 return MatchesForward (ct.Replacement, ref dummy,
1473 ct.Replacement.Length, ti, sortkey, noLv4);
1477 si = FilterOptions (s [idx]);
1479 charSortKey [0] = Category (si);
1480 bool noMatch = false;
1481 if (sortkey [0] == charSortKey [0])
1482 charSortKey [1] = Level1 (si);
1485 if (!ignoreNonSpace && sortkey [1] == charSortKey [1])
1486 charSortKey [2] = Level2 (si, ext);
1487 else if (!ignoreNonSpace)
1490 for (; idx < end; idx++) {
1491 if (Category (s [idx]) != 1)
1496 charSortKey [3] = Uni.Level3 (si);
1497 if (charSortKey [0] != 1)
1500 for (; idx < end; idx++) {
1501 if (Category (s [idx]) != 1)
1505 if (charSortKey [2] == 0)
1506 charSortKey [2] = 2;
1507 charSortKey [2] = (byte) (charSortKey [2]
1508 + Level2 (s [idx], ExtenderType.None));
1511 return MatchesPrimitive (charSortKey, si, ext, sortkey, ti, noLv4);
1514 private bool MatchesPrimitive (byte [] charSortKey, int si, ExtenderType ext, byte [] sortkey, int ti, bool noLv4)
1516 if (charSortKey [0] != sortkey [0] ||
1517 charSortKey [1] != sortkey [1] ||
1518 (!ignoreNonSpace && charSortKey [2] != sortkey [2]) ||
1519 charSortKey [3] != sortkey [3])
1521 if (noLv4 && (si < 0 || !Uni.HasSpecialWeight ((char) si)))
1525 // Since target can never be an extender, if the source
1526 // is an expander and it matters, then they never match.
1527 if (!ignoreNonSpace && ext == ExtenderType.Conditional)
1529 if (Uni.IsJapaneseSmallLetter ((char) si) !=
1530 Uni.IsJapaneseSmallLetter ((char) ti) ||
1531 ToDashTypeValue (ext) !=
1532 // FIXME: we will have to specify correct value for target
1533 ToDashTypeValue (ExtenderType.None) ||
1534 !Uni.IsHiragana ((char) si) !=
1535 !Uni.IsHiragana ((char) ti) ||
1536 IsHalfKana ((char) si) !=
1537 IsHalfKana ((char) ti))
1542 private bool MatchesBackward (string s, ref int idx, int end, int orgStart, int ti, byte [] sortkey, bool noLv4)
1546 ExtenderType ext = GetExtenderType (s [idx]);
1547 // To handle extenders in source, we need to
1548 // check next _primary_ character.
1549 if (ext != ExtenderType.None) {
1550 byte diacritical = 0;
1551 for (int tmp = 0; ; tmp--) {
1552 if (tmp < 0) // heading extender
1554 if (IsIgnorable (s [tmp]))
1556 int tmpi = FilterOptions (s [tmp]);
1557 byte category = Category (tmpi);
1558 if (category == 1) {
1559 diacritical = Level2 (tmpi, ExtenderType.None);
1562 si = FilterExtender (tmpi, ext);
1563 charSortKey [0] = category;
1564 charSortKey [1] = Level1 (si);
1565 if (!ignoreNonSpace)
1566 charSortKey [2] = Level2 (si, ext);
1567 charSortKey [3] = Uni.Level3 (si);
1568 if (ext != ExtenderType.Conditional &&
1571 (charSortKey [2] == 0) ?
1572 (byte) (diacritical + 2) :
1578 Contraction ct = null;
1579 if (ext == ExtenderType.None)
1580 ct = GetContraction (s, idx, end);
1581 // if lv4 exists, it never matches contraction
1583 idx -= ct.Source.Length;
1586 if (ct.SortKey != null) {
1587 for (int i = 0; i < sortkey.Length; i++)
1588 charSortKey [i] = sortkey [i];
1590 previousSortKey = charSortKey;
1592 // Here is the core of LAMESPEC
1593 // described at the top of the source.
1594 int dummy = ct.Replacement.Length - 1;
1595 return MatchesBackward (ct.Replacement,
1596 ref dummy, dummy, -1, ti, sortkey, noLv4);
1598 } else if (ext == ExtenderType.None) {
1600 si = FilterOptions (s [idx]);
1602 bool noMatch = false;
1603 charSortKey [0] = Category (si);
1604 if (charSortKey [0] == sortkey [0])
1605 charSortKey [1] = Level1 (si);
1608 if (!ignoreNonSpace && charSortKey [1] == sortkey [1])
1609 charSortKey [2] = Level2 (si, ext);
1610 else if (!ignoreNonSpace)
1614 charSortKey [3] = Uni.Level3 (si);
1615 if (charSortKey [0] != 1)
1618 if (ext == ExtenderType.None) {
1619 for (int tmp = cur + 1; tmp < orgStart; tmp++) {
1620 if (Category (s [tmp]) != 1)
1624 if (charSortKey [2] == 0)
1625 charSortKey [2] = 2;
1626 charSortKey [2] = (byte) (charSortKey [2]
1627 + Level2 (s [tmp], ExtenderType.None));
1630 return MatchesPrimitive (charSortKey, si, ext, sortkey, ti, noLv4);