4 // This class will demonstrate CompareInfo functionality that will just work.
8 // Here's a summary for supporting contractions, expansions and diacritical
11 // Diacritical mapping is a simple tailoring rule that "remaps" diacritical
12 // weight value from one to another. For now it applies to all range of
13 // characters, but at some stage we might need to limit the remapping ranges.
15 // A Contraction consists of a string (actually char[]) and a sortkey entry
16 // (i.e. byte[]). It indicates that a sequence of characters is interpreted
17 // as a single character that has the mapped sortkey value. There is no
18 // character which goes across "special rules". When the collator encountered
19 // such a sequence of characters, it just returns the sortkey value without
20 // further replacement.
22 // Since it is still likely to happen that a contraction sequence matches
23 // other character than the identical sequence (depending on CompareOptions
24 // and of course, defined sortkey value itself), comparison cannot be done
25 // at source char[] level.
27 // In IndexOf(), the first character in the target string or the target char
28 // itself is turned into sortkey bytes. If the character has a contraction and
29 // that is sortkey map, then it is used instead. If the contraction exists and
30 // that is replacement map, then the first character of the replacement string
31 // is searched instead. IndexOf() always searches only for the top character,
32 // and if it returned negative value, then it returns -1. Otherwise, it then
33 // tries IsPrefix() from that location. If it returns true, then it returns
36 // LAMESPEC: IndexOf() is lame as a whole API. It never matches in the middle
37 // of expansion and there is no proper way to return such indexes within
38 // a single int return value.
40 // For example, try below in .NET:
41 // IndexOf("\u00E6", "a")
42 // IndexOf("\u00E6", "e")
47 using System.Collections;
48 using System.Globalization;
50 using Uni = Mono.Globalization.Unicode.MSCompatUnicodeTable;
51 using UUtil = Mono.Globalization.Unicode.MSCompatUnicodeTableUtil;
53 namespace Mono.Globalization.Unicode
55 internal class SimpleCollator
57 static SimpleCollator invariant =
58 new SimpleCollator (CultureInfo.InvariantCulture);
61 // CompareOptions expanded.
62 bool ignoreNonSpace; // used in IndexOf()
67 TextInfo textInfo; // for ToLower().
69 unsafe readonly byte* cjkCatTable;
70 unsafe readonly byte* cjkLv1Table;
71 readonly CodePointIndexer cjkIndexer;
72 unsafe readonly byte* cjkLv2Table;
73 readonly CodePointIndexer cjkLv2Indexer;
75 readonly Contraction [] contractions;
76 readonly Level2Map [] level2Maps;
78 // This flag marks characters as "unsafe", where the character
79 // could be used as part of a contraction (whose length > 1).
80 readonly bool [] unsafeFlags;
82 const int UnsafeFlagLength = 0x300;
84 // temporary sortkey buffer for index search/comparison
85 byte [] charSortKey = new byte [4];
86 byte [] charSortKey2 = new byte [4];
87 byte [] charSortKeyIndexTarget = new byte [4];
89 #region .ctor() and split functions
91 public SimpleCollator (CultureInfo culture)
94 textInfo = culture.TextInfo;
95 buf = new SortKeyBuffer (culture.LCID);
98 SetCJKTable (culture, ref cjkIndexer,
99 ref cjkCatTable, ref cjkLv1Table,
100 ref cjkLv2Indexer, ref cjkLv2Table);
103 // Get tailoring info
104 TailoringInfo t = null;
105 for (CultureInfo ci = culture; ci.LCID != 127; ci = ci.Parent) {
106 t = Uni.GetTailoringInfo (ci.LCID);
110 if (t == null) // then use invariant
111 t = Uni.GetTailoringInfo (127);
113 frenchSort = t.FrenchSort;
114 Uni.BuildTailoringTables (culture, t, ref contractions,
116 unsafeFlags = new bool [UnsafeFlagLength];
117 foreach (Contraction c in contractions)
118 if (c.Source.Length > 1)
119 foreach (char ch in c.Source)
120 unsafeFlags [(int) ch] = true;
122 // FIXME: Since tailorings are mostly for latin
123 // (and in some cases Cyrillic) characters, it would
124 // be much better for performance to store "start
125 // indexes" for > 370 (culture-specific letters).
128 // dump tailoring table
129 Console.WriteLine ("******** building table for {0} : c - {1} d - {2}",
130 culture.LCID, contractions.Length, level2Maps.Length);
131 foreach (Contraction c in contractions) {
132 foreach (char cc in c.Source)
133 Console.Write ("{0:X4} ", (int) cc);
134 Console.WriteLine (" -> '{0}'", c.Replacement);
139 unsafe private void SetCJKTable (
140 CultureInfo culture, ref CodePointIndexer cjkIndexer,
141 ref byte* catTable, ref byte* lv1Table,
142 ref CodePointIndexer lv2Indexer, ref byte* lv2Table)
144 string name = GetNeutralCulture (culture).Name;
146 Uni.FillCJK (name, ref cjkIndexer, ref catTable,
147 ref lv1Table, ref lv2Indexer, ref lv2Table);
150 static CultureInfo GetNeutralCulture (CultureInfo info)
152 CultureInfo ret = info;
153 while (ret.Parent != null && ret.Parent.LCID != 127)
160 unsafe byte Category (int cp)
162 if (cp < 0x3000 || cjkCatTable == null)
163 return Uni.Category (cp);
164 int idx = cjkIndexer.ToIndex (cp);
165 return idx < 0 ? Uni.Category (cp) : cjkCatTable [idx];
168 unsafe byte Level1 (int cp)
170 if (cp < 0x3000 || cjkLv1Table == null)
171 return Uni.Level1 (cp);
172 int idx = cjkIndexer.ToIndex (cp);
173 return idx < 0 ? Uni.Level1 (cp) : cjkLv1Table [idx];
176 unsafe byte Level2 (int cp, ExtenderType ext)
178 if (ext == ExtenderType.Buggy)
180 else if (ext == ExtenderType.Conditional)
183 if (cp < 0x3000 || cjkLv2Table == null)
184 return Uni.Level2 (cp);
185 int idx = cjkLv2Indexer.ToIndex (cp);
186 byte ret = idx < 0 ? (byte) 0 : cjkLv2Table [idx];
189 ret = Uni.Level2 (cp);
190 if (level2Maps.Length == 0)
192 for (int i = 0; i < level2Maps.Length; i++) {
193 if (level2Maps [i].Source == ret)
194 return level2Maps [i].Replace;
195 else if (level2Maps [i].Source > ret)
201 bool IsHalfKana (int cp)
203 return ignoreWidth || Uni.IsHalfWidthKana ((char) cp);
206 void SetOptions (CompareOptions options)
208 this.ignoreNonSpace = (options & CompareOptions.IgnoreNonSpace) != 0;
209 this.ignoreSymbols = (options & CompareOptions.IgnoreSymbols) != 0;
210 this.ignoreWidth = (options & CompareOptions.IgnoreWidth) != 0;
211 this.ignoreCase = (options & CompareOptions.IgnoreCase) != 0;
212 this.ignoreKanaType = (options & CompareOptions.IgnoreKanaType) != 0;
213 previousChar = previousChar2 = -1;
214 previousSortKey = previousSortKey2 = null;
215 escape1.Source = escape2.Source = null;
218 Contraction GetContraction (string s, int start, int end)
220 Contraction c = GetContraction (s, start, end, contractions);
221 if (c != null || lcid == 127)
223 return GetContraction (s, start, end, invariant.contractions);
226 Contraction GetContraction (string s, int start, int end, Contraction [] clist)
228 for (int i = 0; i < clist.Length; i++) {
229 Contraction ct = clist [i];
230 int diff = ct.Source [0] - s [start];
232 return null; // it's already sorted
235 char [] chars = ct.Source;
236 if (end - start < chars.Length)
239 for (int n = 0; n < chars.Length; n++)
240 if (s [start + n] != chars [n]) {
250 Contraction GetTailContraction (string s, int start, int end)
252 Contraction c = GetTailContraction (s, start, end, contractions);
253 if (c != null || lcid == 127)
255 return GetTailContraction (s, start, end, invariant.contractions);
258 Contraction GetTailContraction (string s, int start, int end, Contraction [] clist)
260 for (int i = 0; i < clist.Length; i++) {
261 Contraction ct = clist [i];
262 int diff = ct.Source [0] - s [end];
264 return null; // it's already sorted
267 char [] chars = ct.Source;
268 if (start - end + 1 < chars.Length)
271 int offset = start - chars.Length + 1;
272 for (int n = 0; n < chars.Length; n++)
273 if (s [offset + n] != chars [n]) {
283 Contraction GetContraction (char c)
285 Contraction ct = GetContraction (c, contractions);
286 if (ct != null || lcid == 127)
288 return GetContraction (c, invariant.contractions);
291 Contraction GetContraction (char c, Contraction [] clist)
293 for (int i = 0; i < clist.Length; i++) {
294 Contraction ct = clist [i];
295 if (ct.Source [0] > c)
296 return null; // it's already sorted
297 if (ct.Source [0] == c && ct.Source.Length == 1)
303 int FilterOptions (int i)
306 int x = Uni.ToWidthCompat (i);
311 i = textInfo.ToLower ((char) i);
313 i = Uni.ToKanaTypeInsensitive (i);
317 int previousChar = -1;
318 byte [] previousSortKey = null;
319 int previousChar2 = -1;
320 byte [] previousSortKey2 = null;
330 ExtenderType GetExtenderType (int i)
332 // LAMESPEC: Windows expects true for U+3005, but
333 // sometimes it does not represent to repeat just
335 // Windows also expects true for U+3031 and U+3032,
336 // but they should *never* repeat one character.
338 // U+2015 becomes an extender only when it is Japanese
340 return lcid == 16 ? ExtenderType.Conditional :
343 if (i < 0x3005 || i > 0xFF70)
344 return ExtenderType.None;
349 return ExtenderType.Simple;
351 return ExtenderType.Conditional;
354 return ExtenderType.Voiced;
358 return ExtenderType.None;
360 case 0x3005: // LAMESPEC: see above.
361 return ExtenderType.Buggy;
362 case 0x3031: // LAMESPEC: see above.
363 case 0x3032: // LAMESPEC: see above.
366 return ExtenderType.Simple;
369 return ExtenderType.Voiced;
371 return ExtenderType.Conditional;
373 return ExtenderType.None;
377 byte ToDashTypeValue (ExtenderType ext)
379 if (ignoreNonSpace) // LAMESPEC: huh, why?
382 case ExtenderType.None:
384 case ExtenderType.Conditional:
391 int FilterExtender (int i, ExtenderType ext)
393 if (ext == ExtenderType.Conditional &&
394 Uni.HasSpecialWeight ((char) i)) {
395 bool half = IsHalfKana ((char) i);
396 bool katakana = !Uni.IsHiragana ((char) i);
397 switch (Level1 (i) & 7) {
399 return half ? 0xFF71 : katakana ? 0x30A2 : 0x3042;
401 return half ? 0xFF72 : katakana ? 0x30A4 : 0x3044;
403 return half ? 0xFF73 : katakana ? 0x30A6 : 0x3046;
405 return half ? 0xFF74 : katakana ? 0x30A8 : 0x3048;
407 return half ? 0xFF75 : katakana ? 0x30AA : 0x304A;
413 bool IsIgnorable (int i)
415 return Uni.IsIgnorable (i) ||
416 ignoreSymbols && Uni.IsIgnorableSymbol (i) ||
417 ignoreNonSpace && Uni.IsIgnorableNonSpacing (i);
422 return i >= unsafeFlags.Length ? true : !unsafeFlags [i];
427 public SortKey GetSortKey (string s)
429 return GetSortKey (s, CompareOptions.None);
432 public SortKey GetSortKey (string s, CompareOptions options)
434 return GetSortKey (s, 0, s.Length, options);
437 public SortKey GetSortKey (string s, int start, int length, CompareOptions options)
439 SetOptions (options);
441 buf.Initialize (options, lcid, s, frenchSort);
442 int end = start + length;
443 GetSortKey (s, start, end);
444 return buf.GetResultAndReset ();
447 void GetSortKey (string s, int start, int end)
449 for (int n = start; n < end; n++) {
452 ExtenderType ext = GetExtenderType (i);
453 if (ext != ExtenderType.None) {
454 i = FilterExtender (previousChar, ext);
456 FillSortKeyRaw (i, ext);
457 else if (previousSortKey != null) {
458 byte [] b = previousSortKey;
462 b [2] != 1 ? b [2] : Level2 (i, ext),
463 b [3] != 1 ? b [3] : Uni.Level3 (i));
465 // otherwise do nothing.
466 // (if the extender is the first char
467 // in the string, then just ignore.)
473 i = FilterOptions (i);
475 Contraction ct = GetContraction (s, n, end);
477 if (ct.Replacement != null) {
478 GetSortKey (ct.Replacement, 0, ct.Replacement.Length);
480 byte [] b = ct.SortKey;
484 b [2] != 1 ? b [2] : Level2 (i, ext),
485 b [3] != 1 ? b [3] : Uni.Level3 (i));
489 n += ct.Source.Length - 1;
492 if (!Uni.IsIgnorableNonSpacing (i))
494 FillSortKeyRaw (i, ExtenderType.None);
499 void FillSortKeyRaw (int i, ExtenderType ext)
501 if (0x3400 <= i && i <= 0x4DB5) {
502 int diff = i - 0x3400;
503 buf.AppendCJKExtension (
504 (byte) (0x10 + diff / 254),
505 (byte) (diff % 254 + 2));
509 UnicodeCategory uc = char.GetUnicodeCategory ((char) i);
511 case UnicodeCategory.PrivateUse:
512 int diff = i - 0xE000;
514 (byte) (0xE5 + diff / 254),
515 (byte) (diff % 254 + 2),
519 case UnicodeCategory.Surrogate:
520 FillSurrogateSortKeyRaw (i);
524 byte level2 = Level2 (i, ext);
525 if (Uni.HasSpecialWeight ((char) i)) {
526 byte level1 = Level1 (i);
532 Uni.IsJapaneseSmallLetter ((char) i),
533 ToDashTypeValue (ext),
534 !Uni.IsHiragana ((char) i),
535 IsHalfKana ((char) i)
537 if (!ignoreNonSpace && ext == ExtenderType.Voiced)
538 // Append voice weight
539 buf.AppendNormal (1, 1, 1, 0);
549 void FillSurrogateSortKeyRaw (int i)
558 lower = (byte) ((i == 0xD800) ? 0x3E : 0x3F);
559 } else if (0xD840 <= i && i < 0xD880) {
563 } else if (0xDB80 <= i && i < 0xDC00) {
564 diffbase = 0xDB80 - 0x40;
568 diffbase = 0xDC00 - 0xF8 + 2;
572 int diff = i - diffbase;
575 (byte) (segment + diff / 254),
576 (byte) (diff % 254 + 2),
585 public int Compare (string s1, string s2)
587 return Compare (s1, s2, CompareOptions.None);
590 public int Compare (string s1, string s2, CompareOptions options)
592 return Compare (s1, 0, s1.Length, s2, 0, s2.Length, options);
597 public string Source;
604 // Those instances are reused not to invoke instantiation
606 Escape escape1 = new Escape ();
607 Escape escape2 = new Escape ();
609 private int CompareOrdinal (string s1, int idx1, int len1,
610 string s2, int idx2, int len2)
612 int min = len1 < len2 ? len1 : len2;
613 int end1 = idx1 + min;
614 int end2 = idx2 + min;
615 for (int i1 = idx1, i2 = idx2;
616 i1 < end1 && i2 < end2; i1++, i2++)
617 if (s1 [i1] != s2 [i2])
618 return s1 [i1] - s2 [i2];
619 return len1 == len2 ? 0 :
620 len1 == min ? - 1 : 1;
623 public int Compare (string s1, int idx1, int len1,
624 string s2, int idx2, int len2, CompareOptions options)
626 // quick equality check
627 if (idx1 == idx2 && len1 == len2 &&
628 Object.ReferenceEquals (s1, s2))
630 // FIXME: this should be done inside Compare() at
632 // int ord = CompareOrdinal (s1, idx1, len1, s2, idx2, len2);
635 if (options == CompareOptions.Ordinal)
636 return CompareOrdinal (s1, idx1, len1, s2, idx2, len2);
638 #if false // stable easy version, depends on GetSortKey().
639 SortKey sk1 = GetSortKey (s1, idx1, len1, options);
640 SortKey sk2 = GetSortKey (s2, idx2, len2, options);
641 byte [] d1 = sk1.KeyData;
642 byte [] d2 = sk2.KeyData;
643 int len = d1.Length > d2.Length ? d2.Length : d1.Length;
644 for (int i = 0; i < len; i++)
645 if (d1 [i] != d2 [i])
646 return d1 [i] < d2 [i] ? -1 : 1;
647 return d1.Length == d2.Length ? 0 : d1.Length < d2.Length ? -1 : 1;
649 SetOptions (options);
651 int ret = CompareInternal (s1, idx1, len1, s2, idx2, len2, (options & CompareOptions.StringSort) != 0, out dummy, out dummy2, true);
652 return ret == 0 ? 0 : ret < 0 ? -1 : 1;
656 int CompareInternal (string s1, int idx1, int len1, string s2,
657 int idx2, int len2, bool stringSort,
658 out bool targetConsumed, out bool sourceConsumed,
659 bool skipHeadingExtenders)
663 int end1 = idx1 + len1;
664 int end2 = idx2 + len2;
665 targetConsumed = false;
666 sourceConsumed = false;
668 // It holds final result that comes from the comparison
669 // at level 2 or lower. Even if Compare() found the
670 // difference at level 2 or lower, it still has to
671 // continue level 1 comparison. FinalResult is used
672 // when there was no further differences.
674 // It holds the comparison level to do. It starts from
675 // 5, and becomes 1 when we do primary-only comparison.
676 int currentLevel = 5;
683 // Skip heading extenders
684 if (skipHeadingExtenders) {
685 for (; idx1 < end1; idx1++)
686 if (GetExtenderType (s1 [idx1]) == ExtenderType.None)
688 for (; idx2 < end2; idx2++)
689 if (GetExtenderType (s2 [idx2]) == ExtenderType.None)
693 ExtenderType ext1 = ExtenderType.None;
694 ExtenderType ext2 = ExtenderType.None;
696 int quickCheckPos1 = idx1;
697 int quickCheckPos2 = idx2;
700 for (; idx1 < end1; idx1++)
701 if (!IsIgnorable (s1 [idx1]))
703 for (; idx2 < end2; idx2++)
704 if (!IsIgnorable (s2 [idx2]))
708 if (escape1.Source == null)
711 start1 = escape1.Start;
712 idx1 = escape1.Index;
714 quickCheckPos1 = escape1.Optional;
715 escape1.Source = null;
719 if (escape2.Source == null)
722 start2 = escape2.Start;
723 idx2 = escape2.Index;
725 quickCheckPos2 = escape2.Optional;
726 escape2.Source = null;
730 // If comparison is unstable, then this part is one of the most doubtful part.
731 // Here we run quick codepoint comparison and run back to "stable" area to
732 // compare characters.
734 // Strictly to say, even the first character
735 // could be compared here, but it messed
736 // backward step, so I just avoided mess.
737 if (quickCheckPos1 < idx1 && quickCheckPos2 < idx2) {
738 while (idx1 < end1 && idx2 < end2 &&
739 s1 [idx1] == s2 [idx2]) {
743 if (idx1 == end1 || idx2 == end2)
744 continue; // check replacement
746 int backwardEnd1 = quickCheckPos1;
747 int backwardEnd2 = quickCheckPos2;
748 quickCheckPos1 = idx1;
749 quickCheckPos2 = idx2;
754 for (;idx1 > backwardEnd1; idx1--)
755 if (Category (s1 [idx1]) != 1)
757 for (;idx2 > backwardEnd2; idx2--)
758 if (Category (s2 [idx2]) != 1)
760 for (;idx1 > backwardEnd1; idx1--)
761 if (IsSafe (s1 [idx1]))
763 for (;idx2 > backwardEnd2; idx2--)
764 if (IsSafe (s2 [idx2]))
773 int i1 = FilterOptions (s1 [idx1]);
774 int i2 = FilterOptions (s2 [idx2]);
775 bool special1 = false;
776 bool special2 = false;
778 // If current character is an expander, then
779 // repeat the previous character.
780 ext1 = GetExtenderType (i1);
781 if (ext1 != ExtenderType.None) {
782 if (previousChar < 0) {
783 if (previousSortKey == null) {
788 sk1 = previousSortKey;
791 i1 = FilterExtender (previousChar, ext1);
793 ext2 = GetExtenderType (i2);
794 if (ext2 != ExtenderType.None) {
795 if (previousChar2 < 0) {
796 if (previousSortKey2 == null) {
801 sk2 = previousSortKey2;
804 i2 = FilterExtender (previousChar2, ext2);
807 byte cat1 = Category (i1);
808 byte cat2 = Category (i2);
810 // Handle special weight characters
811 if (!stringSort && currentLevel > 4) {
813 lv5At1 = escape1.Source != null ?
814 escape1.Index - escape1.Start :
816 // here Windows has a bug that it does
817 // not consider thirtiary weight.
818 lv5Value1 = Level1 (i1) << 8 + Uni.Level3 (i1);
823 lv5At2 = escape2.Source != null ?
824 escape2.Index - escape2.Start :
826 // here Windows has a bug that it does
827 // not consider thirtiary weight.
828 lv5Value2 = Level1 (i2) << 8 + Uni.Level3 (i2);
832 if (cat1 == 6 || cat2 == 6) {
838 Contraction ct1 = null;
839 if (ext1 == ExtenderType.None)
840 ct1 = GetContraction (s1, idx1, end1);
845 else if (ct1 != null) {
846 offset1 = ct1.Source.Length;
847 if (ct1.SortKey != null) {
849 for (int i = 0; i < ct1.SortKey.Length; i++)
850 sk1 [i] = ct1.SortKey [i];
852 previousSortKey = sk1;
854 else if (escape1.Source == null) {
856 escape1.Start = start1;
857 escape1.Index = cur1 + ct1.Source.Length;
859 escape1.Optional = quickCheckPos1;
860 s1 = ct1.Replacement;
871 sk1 [1] = Level1 (i1);
872 if (!ignoreNonSpace && currentLevel > 1)
873 sk1 [2] = Level2 (i1, ext1);
874 if (currentLevel > 2)
875 sk1 [3] = Uni.Level3 (i1);
876 if (currentLevel > 3)
877 special1 = Uni.HasSpecialWeight ((char) i1);
882 Contraction ct2 = null;
883 if (ext2 == ExtenderType.None)
884 ct2 = GetContraction (s2, idx2, end2);
888 else if (ct2 != null) {
889 idx2 += ct2.Source.Length;
890 if (ct2.SortKey != null) {
892 for (int i = 0; i < ct2.SortKey.Length; i++)
893 sk2 [i] = ct2.SortKey [i];
895 previousSortKey2 = sk2;
897 else if (escape2.Source == null) {
899 escape2.Start = start2;
900 escape2.Index = cur2 + ct2.Source.Length;
902 escape2.Optional = quickCheckPos2;
903 s2 = ct2.Replacement;
914 sk2 [1] = Level1 (i2);
915 if (!ignoreNonSpace && currentLevel > 1)
916 sk2 [2] = Level2 (i2, ext2);
917 if (currentLevel > 2)
918 sk2 [3] = Uni.Level3 (i2);
919 if (currentLevel > 3)
920 special2 = Uni.HasSpecialWeight ((char) i2);
926 // Add offset here so that it does not skip
927 // idx1 while s2 has a replacement.
930 // add diacritical marks in s1 here
931 if (!ignoreNonSpace) {
932 while (idx1 < end1) {
933 if (Category (s1 [idx1]) != 1)
937 sk1 [2] = (byte) (sk1 [2] +
938 Level2 (s1 [idx1], ExtenderType.None));
942 // add diacritical marks in s2 here
943 while (idx2 < end2) {
944 if (Category (s2 [idx2]) != 1)
948 sk2 [2] = (byte) (sk2 [2] +
949 Level2 (s2 [idx2], ExtenderType.None));
954 int ret = sk1 [0] - sk2 [0];
955 ret = ret != 0 ? ret : sk1 [1] - sk2 [1];
958 if (currentLevel == 1)
960 if (!ignoreNonSpace) {
961 ret = sk1 [2] - sk2 [2];
964 currentLevel = frenchSort ? 2 : 1;
968 if (currentLevel == 2)
970 ret = sk1 [3] - sk2 [3];
976 if (currentLevel == 3)
978 if (special1 != special2) {
979 finalResult = special1 ? 1 : -1;
984 ret = CompareFlagPair (
985 !Uni.IsJapaneseSmallLetter ((char) i1),
986 !Uni.IsJapaneseSmallLetter ((char) i2));
987 ret = ret != 0 ? ret :
988 ToDashTypeValue (ext1) -
989 ToDashTypeValue (ext2);
990 ret = ret != 0 ? ret : CompareFlagPair (
991 Uni.IsHiragana ((char) i1),
992 Uni.IsHiragana ((char) i2));
993 ret = ret != 0 ? ret : CompareFlagPair (
994 !IsHalfKana ((char) i1),
995 !IsHalfKana ((char) i2));
1004 // If there were only level 3 or lower differences,
1005 // then we still have to find diacritical differences
1007 if (!ignoreNonSpace &&
1008 finalResult != 0 && currentLevel > 2) {
1009 while (idx1 < end1 && idx2 < end2) {
1010 if (!Uni.IsIgnorableNonSpacing (s1 [idx1]))
1012 if (!Uni.IsIgnorableNonSpacing (s2 [idx2]))
1014 finalResult = Level2 (FilterOptions ((s1 [idx1])), ext1) - Level2 (FilterOptions (s2 [idx2]), ext2);
1015 if (finalResult != 0)
1019 // they should work only for the first character
1020 ext1 = ExtenderType.None;
1021 ext2 = ExtenderType.None;
1024 if (currentLevel == 1 && finalResult != 0) {
1026 if (Uni.IsIgnorableNonSpacing (s1 [idx1]))
1029 if (Uni.IsIgnorableNonSpacing (s2 [idx2]))
1032 // we still have to handle level 5
1033 if (finalResult == 0) {
1034 finalResult = lv5At1 - lv5At2;
1035 if (finalResult == 0)
1036 finalResult = lv5Value1 - lv5Value2;
1038 if (finalResult == 0) {
1040 targetConsumed = true;
1042 sourceConsumed = true;
1044 return idx1 != end1 ? 1 : idx2 == end2 ? finalResult : -1;
1047 int CompareFlagPair (bool b1, bool b2)
1049 return b1 == b2 ? 0 : b1 ? 1 : -1;
1054 #region IsPrefix() and IsSuffix()
1056 public bool IsPrefix (string src, string target, CompareOptions opt)
1058 return IsPrefix (src, target, 0, src.Length, opt);
1061 public bool IsPrefix (string s, string target, int start, int length, CompareOptions opt)
1064 return IsPrefix (s, target, start, length,
1065 (opt & CompareOptions.StringSort) != 0, true);
1068 public bool IsPrefix (string s, string target, int start, int length, bool stringSort, bool skipHeadingExtenders)
1070 bool consumed, dummy;
1071 int ret = CompareInternal (s, start, length,
1072 target, 0, target.Length, stringSort,
1073 out consumed, out dummy, skipHeadingExtenders);
1079 public bool IsSuffix (string src, string target, CompareOptions opt)
1081 return IsSuffix (src, target, src.Length - 1, src.Length, opt);
1084 public bool IsSuffix (string s, string target, int start, int length, CompareOptions opt)
1086 // quick check : simple codepoint comparison
1087 if (s.Length >= target.Length) {
1089 int se = start - length;
1090 for (int i = target.Length - 1; si >= se && i >= 0; i--, si--)
1091 if (s [si] != target [i])
1093 if (si == start + target.Length)
1098 return IsSuffix (s, target, start, length,
1099 (opt & CompareOptions.StringSort) != 0);
1102 bool IsSuffix (string s, string t, int start, int length,
1106 for (;tstart < t.Length; tstart++)
1107 if (!IsIgnorable (t [tstart]))
1109 if (tstart == t.Length)
1110 return true; // as if target is String.Empty.
1113 // This is still not working. If it is not likely to get working, then just remove it.
1115 int send = start - length;
1116 int ti = t.Length - 1;
1119 int sStep = start + 1;
1120 int tStep = t.Length;
1121 bool sourceConsumed, targetConsumed;
1123 for (; send < si; si--)
1124 if (!IsIgnorable (s [si]))
1126 for (; tend < ti; ti--)
1127 if (!IsIgnorable (t [ti]))
1131 for (; send < si; si--)
1132 if (IsSafe (s [si]))
1134 for (; tend < ti; ti--)
1135 if (IsSafe (t [ti]))
1137 Console.WriteLine ("==== {0} {1} {2} {3} {4} {5} {6} {7} {8}", s, si, send, length, t, ti, tstart, sStep - si, tStep - ti);
1138 if (CompareInternal (s, si, sStep - si,
1139 t, ti, tStep - ti, stringSort,
1140 out targetConsumed, out sourceConsumed,
1152 // FIXME: it is not efficient for very long target.
1153 bool sourceConsumed, targetConsumed;
1154 int mismatchCount = 0;
1155 for (int i = 0; i < length; i++) {
1156 escape1.Source = escape2.Source = null;
1157 previousSortKey = previousSortKey2 = null;
1158 previousChar = previousChar2 = -1;
1160 int ret = CompareInternal (s, start - i, i + 1,
1161 t, tstart, t.Length - tstart,
1162 stringSort, out targetConsumed,
1163 out sourceConsumed, true);
1166 if (!sourceConsumed && targetConsumed)
1168 if (!targetConsumed) {
1170 if (mismatchCount > Uni.MaxExpansionLength)
1171 // The largest length of an
1172 // expansion is 3, so if the
1173 // target was not consumed more
1174 // than 3 times, then the tail
1175 // character does not match.
1185 #region IndexOf() / LastIndexOf()
1187 public int IndexOf (string s, string target, CompareOptions opt)
1189 return IndexOf (s, target, 0, s.Length, opt);
1192 public int IndexOf (string s, string target, int start, int length, CompareOptions opt)
1195 return IndexOf (s, target, start, length,
1196 (opt & CompareOptions.StringSort) != 0);
1199 public int IndexOf (string s, char target, CompareOptions opt)
1201 return IndexOf (s, target, 0, s.Length, opt);
1204 public int IndexOf (string s, char target, int start, int length, CompareOptions opt)
1208 // If target is contraction, then use string search.
1209 Contraction ct = GetContraction (target);
1211 if (ct.Replacement != null)
1212 return IndexOf (s, ct.Replacement, start, length,
1213 (opt & CompareOptions.StringSort) != 0);
1215 return IndexOfSortKey (s, start, length, ct.SortKey, char.MinValue, -1, true);
1217 int ti = FilterOptions ((int) target);
1218 charSortKeyIndexTarget [0] = Category (ti);
1219 charSortKeyIndexTarget [1] = Level1 (ti);
1220 if (!ignoreNonSpace)
1221 charSortKeyIndexTarget [2] =
1222 Level2 (ti, ExtenderType.None);
1223 charSortKeyIndexTarget [3] = Uni.Level3 (ti);
1224 return IndexOfSortKey (s, start, length,
1225 charSortKeyIndexTarget, target, ti,
1226 !Uni.HasSpecialWeight ((char) ti));
1230 // Searches target byte[] keydata
1231 int IndexOfSortKey (string s, int start, int length, byte [] sortkey, char target, int ti, bool noLv4)
1233 int end = start + length;
1237 if (MatchesForward (s, ref idx, end, ti, sortkey, noLv4))
1243 // Searches string. Search head character (or keydata when
1244 // the head is contraction sortkey) and try IsPrefix().
1245 int IndexOf (string s, string target, int start, int length, bool stringSort)
1248 for (; tidx < target.Length; tidx++)
1249 if (!IsIgnorable (target [tidx]))
1251 if (tidx == target.Length)
1253 Contraction ct = GetContraction (target, tidx, target.Length - tidx);
1254 string replace = ct != null ? ct.Replacement : null;
1255 byte [] sk = replace == null ? charSortKeyIndexTarget : null;
1257 char tc = char.MinValue;
1259 if (ct != null && sk != null) {
1260 for (int i = 0; i < ct.SortKey.Length; i++)
1261 sk [i] = ct.SortKey [i];
1262 } else if (sk != null) {
1264 ti = FilterOptions (target [tidx]);
1265 sk [0] = Category (ti);
1266 sk [1] = Level1 (ti);
1267 if (!ignoreNonSpace)
1268 sk [2] = Level2 (ti, ExtenderType.None);
1269 sk [3] = Uni.Level3 (ti);
1270 noLv4 = !Uni.HasSpecialWeight ((char) ti);
1273 for (tidx++; tidx < target.Length; tidx++) {
1274 if (Category (target [tidx]) != 1)
1278 sk [2] = (byte) (sk [2] + Level2 (target [tidx], ExtenderType.None));
1284 if (replace != null)
1285 idx = IndexOf (s, replace, start, length, stringSort);
1287 idx = IndexOfSortKey (s, start, length, sk, tc, ti, noLv4);
1290 length -= idx - start;
1292 if (IsPrefix (s, target, start, length, stringSort, false))
1294 Contraction cts = GetContraction (s, start, length);
1296 start += cts.Source.Length;
1297 length -= cts.Source.Length;
1302 } while (length > 0);
1307 // There are the same number of IndexOf() related methods,
1308 // with the same functionalities for each.
1311 public int LastIndexOf (string s, string target, CompareOptions opt)
1313 return LastIndexOf (s, target, s.Length - 1, s.Length, opt);
1316 public int LastIndexOf (string s, string target, int start, int length, CompareOptions opt)
1319 return LastIndexOf (s, target, start, length,
1320 (opt & CompareOptions.StringSort) != 0);
1323 public int LastIndexOf (string s, char target, CompareOptions opt)
1325 return LastIndexOf (s, target, s.Length - 1, s.Length, opt);
1328 public int LastIndexOf (string s, char target, int start, int length, CompareOptions opt)
1332 // If target is contraction, then use string search.
1333 Contraction ct = GetContraction (target);
1335 if (ct.Replacement != null)
1336 return LastIndexOf (s, ct.Replacement, start, length, (opt & CompareOptions.StringSort) != 0);
1338 return LastIndexOfSortKey (s, start, start, length, ct.SortKey, char.MinValue, -1, true);
1341 int ti = FilterOptions ((int) target);
1342 charSortKeyIndexTarget [0] = Category (ti);
1343 charSortKeyIndexTarget [1] = Level1 (ti);
1344 if (!ignoreNonSpace)
1345 charSortKeyIndexTarget [2] = Level2 (ti, ExtenderType.None);
1346 charSortKeyIndexTarget [3] = Uni.Level3 (ti);
1347 return LastIndexOfSortKey (s, start, start, length, charSortKeyIndexTarget, target, ti, !Uni.HasSpecialWeight ((char) ti));
1351 // Searches target byte[] keydata
1352 int LastIndexOfSortKey (string s, int start, int orgStart, int length, byte [] sortkey, char target, int ti, bool noLv4)
1354 int end = start - length;
1358 if (MatchesBackward (s, ref idx, end, orgStart, ti, sortkey, noLv4))
1364 // Searches string. Search head character (or keydata when
1365 // the head is contraction sortkey) and try IsPrefix().
1366 int LastIndexOf (string s, string target, int start, int length, bool stringSort)
1368 int orgStart = start;
1370 for (; tidx < target.Length; tidx++)
1371 if (!IsIgnorable (target [tidx]))
1373 if (tidx == target.Length)
1375 Contraction ct = GetContraction (target, tidx, target.Length - tidx);
1376 string replace = ct != null ? ct.Replacement : null;
1377 byte [] sk = replace == null ? charSortKeyIndexTarget : null;
1380 char tc = char.MinValue;
1382 if (ct != null && sk != null) {
1383 for (int i = 0; i < ct.SortKey.Length; i++)
1384 sk [i] = ct.SortKey [i];
1385 } else if (sk != null) {
1387 ti = FilterOptions (target [tidx]);
1388 sk [0] = Category (ti);
1389 sk [1] = Level1 (ti);
1390 if (!ignoreNonSpace)
1391 sk [2] = Level2 (ti, ExtenderType.None);
1392 sk [3] = Uni.Level3 (ti);
1393 noLv4 = !Uni.HasSpecialWeight ((char) ti);
1396 for (tidx++; tidx < target.Length; tidx++) {
1397 if (Category (target [tidx]) != 1)
1401 sk [2] = (byte) (sk [2] + Level2 (target [tidx], ExtenderType.None));
1408 if (replace != null)
1409 idx = LastIndexOf (s, replace, start, length, stringSort);
1411 idx = LastIndexOfSortKey (s, start, orgStart, length, sk, tc, ti, noLv4);
1414 length -= start - idx;
1417 if (IsPrefix (s, target, idx, orgStart - idx + 1, stringSort, false)) {
1418 for (;idx < orgStart; idx++)
1419 if (!IsIgnorable (s [idx]))
1423 Contraction cts = GetContraction (s, idx, orgStart - idx + 1);
1425 start -= cts.Source.Length;
1426 length -= cts.Source.Length;
1431 } while (length > 0);
1435 private bool MatchesForward (string s, ref int idx, int end, int ti, byte [] sortkey, bool noLv4)
1438 ExtenderType ext = GetExtenderType (s [idx]);
1439 Contraction ct = null;
1440 if (ext == ExtenderType.None)
1441 ct = GetContraction (s, idx, end);
1442 else if (previousChar < 0) {
1443 if (previousSortKey == null) {
1447 charSortKey = previousSortKey;
1450 si = FilterExtender (previousChar, ext);
1451 // if lv4 exists, it never matches contraction
1453 idx += ct.Source.Length;
1456 if (ct.SortKey != null) {
1457 for (int i = 0; i < sortkey.Length; i++)
1458 charSortKey [i] = sortkey [i];
1460 previousSortKey = charSortKey;
1462 // Here is the core of LAMESPEC
1463 // described at the top of the source.
1465 return MatchesForward (ct.Replacement, ref dummy,
1466 ct.Replacement.Length, ti, sortkey, noLv4);
1470 si = FilterOptions (s [idx]);
1471 charSortKey [0] = Category (si);
1472 charSortKey [1] = Level1 (si);
1473 if (!ignoreNonSpace)
1474 charSortKey [2] = Level2 (si, ext);
1475 charSortKey [3] = Uni.Level3 (si);
1476 if (charSortKey [0] != 1)
1480 for (; idx < end; idx++) {
1481 if (Category (s [idx]) != 1)
1485 if (charSortKey [2] == 0)
1486 charSortKey [2] = 2;
1487 charSortKey [2] = (byte) (charSortKey [2]
1488 + Level2 (s [idx], ExtenderType.None));
1491 return MatchesPrimitive (charSortKey, si, ext, sortkey, ti, noLv4);
1494 private bool MatchesPrimitive (byte [] charSortKey, int si, ExtenderType ext, byte [] sortkey, int ti, bool noLv4)
1496 if (charSortKey [0] != sortkey [0] ||
1497 charSortKey [1] != sortkey [1] ||
1498 (!ignoreNonSpace && charSortKey [2] != sortkey [2]) ||
1499 charSortKey [3] != sortkey [3])
1501 if (noLv4 && (si < 0 || !Uni.HasSpecialWeight ((char) si)))
1505 // Since target can never be an extender, if the source
1506 // is an expander and it matters, then they never match.
1507 if (!ignoreNonSpace && ext == ExtenderType.Conditional)
1509 if (Uni.IsJapaneseSmallLetter ((char) si) !=
1510 Uni.IsJapaneseSmallLetter ((char) ti) ||
1511 ToDashTypeValue (ext) !=
1512 // FIXME: we will have to specify correct value for target
1513 ToDashTypeValue (ExtenderType.None) ||
1514 !Uni.IsHiragana ((char) si) !=
1515 !Uni.IsHiragana ((char) ti) ||
1516 IsHalfKana ((char) si) !=
1517 IsHalfKana ((char) ti))
1522 private bool MatchesBackward (string s, ref int idx, int end, int orgStart, int ti, byte [] sortkey, bool noLv4)
1526 ExtenderType ext = GetExtenderType (s [idx]);
1527 // To handle extenders in source, we need to
1528 // check next _primary_ character.
1529 if (ext != ExtenderType.None) {
1530 byte diacritical = 0;
1531 for (int tmp = 0; ; tmp--) {
1532 if (tmp < 0) // heading extender
1534 if (IsIgnorable (s [tmp]))
1536 int tmpi = FilterOptions (s [tmp]);
1537 byte category = Category (tmpi);
1538 if (category == 1) {
1539 diacritical = Level2 (tmpi, ExtenderType.None);
1542 si = FilterExtender (tmpi, ext);
1543 charSortKey [0] = category;
1544 charSortKey [1] = Level1 (si);
1545 if (!ignoreNonSpace)
1546 charSortKey [2] = Level2 (si, ext);
1547 charSortKey [3] = Uni.Level3 (si);
1548 if (ext != ExtenderType.Conditional &&
1551 (charSortKey [2] == 0) ?
1552 (byte) (diacritical + 2) :
1558 Contraction ct = null;
1559 if (ext == ExtenderType.None)
1560 ct = GetContraction (s, idx, end);
1561 // if lv4 exists, it never matches contraction
1563 idx -= ct.Source.Length;
1566 if (ct.SortKey != null) {
1567 for (int i = 0; i < sortkey.Length; i++)
1568 charSortKey [i] = sortkey [i];
1570 previousSortKey = charSortKey;
1572 // Here is the core of LAMESPEC
1573 // described at the top of the source.
1574 int dummy = ct.Replacement.Length - 1;
1575 return MatchesBackward (ct.Replacement,
1576 ref dummy, dummy, -1, ti, sortkey, noLv4);
1578 } else if (ext == ExtenderType.None) {
1580 si = FilterOptions (s [idx]);
1581 charSortKey [0] = Category (si);
1582 charSortKey [1] = Level1 (si);
1583 if (!ignoreNonSpace)
1584 charSortKey [2] = Level2 (si, ext);
1585 charSortKey [3] = Uni.Level3 (si);
1586 if (charSortKey [0] != 1)
1590 if (ext == ExtenderType.None) {
1591 for (int tmp = cur + 1; tmp < orgStart; tmp++) {
1592 if (Category (s [tmp]) != 1)
1596 if (charSortKey [2] == 0)
1597 charSortKey [2] = 2;
1598 charSortKey [2] = (byte) (charSortKey [2]
1599 + Level2 (s [tmp], ExtenderType.None));
1602 return MatchesPrimitive (charSortKey, si, ext, sortkey, ti, noLv4);