4 // This class will demonstrate CompareInfo functionality that will just work.
8 // Here's a summary for supporting contractions, expansions and diacritical
11 // Diacritical mapping is a simple tailoring rule that "remaps" diacritical
12 // weight value from one to another. For now it applies to all range of
13 // characters, but at some stage we might need to limit the remapping ranges.
15 // A Contraction consists of a string (actually char[]) and a sortkey entry
16 // (i.e. byte[]). It indicates that a sequence of characters is interpreted
17 // as a single character that has the mapped sortkey value. There is no
18 // character which goes across "special rules". When the collator encountered
19 // such a sequence of characters, it just returns the sortkey value without
20 // further replacement.
22 // Since it is still likely to happen that a contraction sequence matches
23 // other character than the identical sequence (depending on CompareOptions
24 // and of course, defined sortkey value itself), comparison cannot be done
25 // at source char[] level.
31 // In IndexOf(), the first character in the target string or the target char
32 // itself is turned into sortkey bytes. If the character has a contraction and
33 // that is sortkey map, then it is used instead. If the contraction exists and
34 // that is replacement map, then the first character of the replacement string
35 // is searched instead. IndexOf() always searches only for the top character,
36 // and if it returned negative value, then it returns -1. Otherwise, it then
37 // tries IsPrefix() from that location. If it returns true, then it returns
41 // LAMESPEC: IndexOf() is lame as a whole API. It never matches in the middle
42 // of expansion and there is no proper way to return such indexes within
43 // a single int return value.
45 // For example, try below in .NET:
46 // IndexOf("\u00E6", "a")
47 // IndexOf("\u00E6", "e")
52 using System.Collections;
53 using System.Globalization;
55 using Uni = Mono.Globalization.Unicode.MSCompatUnicodeTable;
56 using UUtil = Mono.Globalization.Unicode.MSCompatUnicodeTableUtil;
58 namespace Mono.Globalization.Unicode
60 internal class SimpleCollator
62 static SimpleCollator invariant =
63 new SimpleCollator (CultureInfo.InvariantCulture);
65 internal static readonly byte [] ignorableFlags =
67 internal static readonly byte [] categories =
69 internal static readonly byte [] level1 =
71 internal static readonly byte [] level2 =
73 internal static readonly byte [] level3 =
75 internal static readonly ushort [] widthCompat =
77 internal static readonly CodePointIndexer categoryIndexer =
79 internal static readonly CodePointIndexer lv1Indexer =
81 internal static readonly CodePointIndexer lv2Indexer =
83 internal static readonly CodePointIndexer lv3Indexer =
85 internal static readonly CodePointIndexer widthIndexer =
90 // CompareOptions expanded.
91 bool ignoreNonSpace; // used in IndexOf()
96 TextInfo textInfo; // for ToLower().
98 readonly ushort [] cjkTable;
99 readonly CodePointIndexer cjkIndexer;
100 readonly byte [] cjkLv2Table;
101 readonly CodePointIndexer cjkLv2Indexer;
103 readonly Contraction [] contractions;
104 readonly Level2Map [] level2Maps;
106 // temporary sortkey buffer for index search/comparison
107 byte [] charSortKey = new byte [4];
108 byte [] charSortKey2 = new byte [4];
109 byte [] charSortKeyIndexTarget = new byte [4];
111 #region Tailoring support classes
112 // Possible mapping types are:
114 // - string to string (ReplacementMap)
115 // - string to SortKey (SortKeyMap)
116 // - diacritical byte to byte (DiacriticalMap)
118 // There could be mapping from string to sortkeys, but
119 // for now there is none as such.
121 internal class Contraction
123 public readonly char [] Source;
124 // only either of them is used.
125 public readonly string Replacement;
126 public readonly byte [] SortKey;
128 public Contraction (char [] source,
129 string replacement, byte [] sortkey)
132 Replacement = replacement;
137 internal class ContractionComparer : IComparer
139 public static readonly ContractionComparer Instance =
140 new ContractionComparer ();
142 public int Compare (object o1, object o2)
144 Contraction c1 = (Contraction) o1;
145 Contraction c2 = (Contraction) o2;
146 char [] a1 = c1.Source;
147 char [] a2 = c2.Source;
148 int min = a1.Length > a2.Length ?
149 a2.Length : a1.Length;
150 for (int i = 0; i < min; i++)
151 if (a1 [i] != a2 [i])
152 return a1 [i] - a2 [i];
153 return a1.Length - a2.Length;
157 internal class Level2Map
162 public Level2Map (byte source, byte replace)
169 internal class Level2MapComparer : IComparer
171 public static readonly Level2MapComparer Instance =
172 new Level2MapComparer ();
174 public int Compare (object o1, object o2)
176 Level2Map m1 = (Level2Map) o1;
177 Level2Map m2 = (Level2Map) o2;
178 return (m1.Source - m2.Source);
184 #region .ctor() and split functions
186 public SimpleCollator (CultureInfo culture)
189 textInfo = culture.TextInfo;
190 buf = new SortKeyBuffer (culture.LCID);
192 SetCJKTable (culture, ref cjkTable, ref cjkIndexer,
193 ref cjkLv2Table, ref cjkLv2Indexer);
195 // Get tailoring info
196 TailoringInfo t = null;
197 for (CultureInfo ci = culture; ci.LCID != 127; ci = ci.Parent) {
198 t = Uni.GetTailoringInfo (ci.LCID);
202 if (t == null) // then use invariant
203 t = Uni.GetTailoringInfo (127);
205 frenchSort = t.FrenchSort;
206 BuildTailoringTables (culture, t, ref contractions,
208 // FIXME: Since tailorings are mostly for latin
209 // (and in some cases Cyrillic) characters, it would
210 // be much better for performance to store "start
211 // indexes" for > 370 (culture-specific letters).
214 // dump tailoring table
215 Console.WriteLine ("******** building table for {0} : c - {1} d - {2}",
216 culture.LCID, contractions.Length, level2Maps.Length);
217 foreach (Contraction c in contractions) {
218 foreach (char cc in c.Source)
219 Console.Write ("{0:X4} ", (int) cc);
220 Console.WriteLine (" -> '{0}'", c.Replacement);
225 private void BuildTailoringTables (CultureInfo culture,
227 ref Contraction [] contractions,
228 ref Level2Map [] diacriticals)
230 // collect tailoring entries.
231 ArrayList cmaps = new ArrayList ();
232 ArrayList dmaps = new ArrayList ();
233 char [] tarr = Uni.TailoringValues;
234 int idx = t.TailoringIndex;
235 int end = idx + t.TailoringCount;
239 switch (tarr [idx]) {
240 case '\x1': // SortKeyMap
242 while (tarr [ss] != 0)
244 src = new char [ss - idx];
245 Array.Copy (tarr, idx, src, 0, ss - idx);
246 byte [] sortkey = new byte [4];
247 for (int i = 0; i < 4; i++)
248 sortkey [i] = (byte) tarr [ss + 1 + i];
249 cmaps.Add (new Contraction (
250 src, null, sortkey));
254 case '\x2': // DiacriticalMap
255 dmaps.Add (new Level2Map (
256 (byte) tarr [idx + 1],
257 (byte) tarr [idx + 2]));
260 case '\x3': // ReplacementMap
262 while (tarr [ss] != 0)
264 src = new char [ss - idx];
265 Array.Copy (tarr, idx, src, 0, ss - idx);
268 while (tarr [l] != 0)
270 string r = new string (tarr, ss, l - ss);
271 cmaps.Add (new Contraction (
276 throw new NotImplementedException (String.Format ("Mono INTERNAL ERROR (Should not happen): Collation tailoring table is broken for culture {0} ({1}) at 0x{2:X}", culture.LCID, culture.Name, idx));
279 cmaps.Sort (ContractionComparer.Instance);
280 dmaps.Sort (Level2MapComparer.Instance);
281 contractions = cmaps.ToArray (typeof (Contraction))
283 diacriticals = dmaps.ToArray (typeof (Level2Map))
287 private void SetCJKTable (CultureInfo culture,
288 ref ushort [] cjkTable, ref CodePointIndexer cjkIndexer,
289 ref byte [] cjkLv2Table, ref CodePointIndexer cjkLv2Indexer)
291 string name = GetNeutralCulture (culture).Name;
295 // custom CJK table support.
298 cjkTable = Uni.CjkCHS;
299 cjkIndexer = UUtil.CjkCHS;
302 cjkTable = Uni.CjkCHT;
303 cjkIndexer = UUtil.Cjk;
306 cjkTable = Uni.CjkJA;
307 cjkIndexer = UUtil.Cjk;
310 cjkTable = Uni.CjkKO;
311 cjkLv2Table = Uni.CjkKOLv2;
312 cjkIndexer = UUtil.Cjk;
313 cjkLv2Indexer = UUtil.Cjk;
318 static CultureInfo GetNeutralCulture (CultureInfo info)
320 CultureInfo ret = info;
321 while (ret.Parent != null && ret.Parent.LCID != 127)
328 byte Category (int cp)
330 if (cp < 0x3000 || cjkTable == null)
331 return categories [categoryIndexer.ToIndex (cp)];
332 int idx = cjkIndexer.ToIndex (cp);
333 ushort cjk = idx < 0 ? (ushort) 0 : cjkTable [idx];
334 return cjk != 0 ? (byte) ((cjk & 0xFF00) >> 8) :
335 categories [categoryIndexer.ToIndex (cp)];
340 if (cp < 0x3000 || cjkTable == null)
341 return level1 [lv1Indexer.ToIndex (cp)];
342 int idx = cjkIndexer.ToIndex (cp);
343 ushort cjk = idx < 0 ? (ushort) 0 : cjkTable [idx];
344 return cjk != 0 ? (byte) (cjk & 0xFF) :
345 level1 [lv1Indexer.ToIndex (cp)];
348 byte Level2 (int cp, ExtenderType ext)
350 if (ext == ExtenderType.Buggy)
352 else if (ext == ExtenderType.Conditional)
355 if (cp < 0x3000 || cjkLv2Table == null)
356 return level2 [lv2Indexer.ToIndex (cp)];
357 int idx = cjkLv2Indexer.ToIndex (cp);
358 byte ret = idx < 0 ? (byte) 0 : cjkLv2Table [idx];
361 ret = level2 [lv2Indexer.ToIndex (cp)];
362 if (level2Maps.Length == 0)
364 for (int i = 0; i < level2Maps.Length; i++) {
365 if (level2Maps [i].Source == ret)
366 return level2Maps [i].Replace;
367 else if (level2Maps [i].Source > ret)
373 bool IsHalfKana (int cp)
375 return ignoreWidth || Uni.IsHalfWidthKana ((char) cp);
378 void SetOptions (CompareOptions options)
380 this.ignoreNonSpace = (options & CompareOptions.IgnoreNonSpace) != 0;
381 this.ignoreSymbols = (options & CompareOptions.IgnoreSymbols) != 0;
382 this.ignoreWidth = (options & CompareOptions.IgnoreWidth) != 0;
383 this.ignoreCase = (options & CompareOptions.IgnoreCase) != 0;
384 this.ignoreKanaType = (options & CompareOptions.IgnoreKanaType) != 0;
385 previousChar = previousChar2 = -1;
386 previousSortKey = previousSortKey2 = null;
387 escape1.Source = escape2.Source = null;
390 Contraction GetContraction (string s, int start, int end)
392 Contraction c = GetContraction (s, start, end, contractions);
393 if (c != null || lcid == 127)
395 return GetContraction (s, start, end, invariant.contractions);
398 Contraction GetContraction (string s, int start, int end, Contraction [] clist)
400 for (int i = 0; i < clist.Length; i++) {
401 Contraction ct = clist [i];
402 int diff = ct.Source [0] - s [start];
404 return null; // it's already sorted
407 char [] chars = ct.Source;
408 if (end - start < chars.Length)
411 for (int n = 0; n < chars.Length; n++)
412 if (s [start + n] != chars [n]) {
422 Contraction GetTailContraction (string s, int start, int end)
424 Contraction c = GetTailContraction (s, start, end, contractions);
425 if (c != null || lcid == 127)
427 return GetTailContraction (s, start, end, invariant.contractions);
430 Contraction GetTailContraction (string s, int start, int end, Contraction [] clist)
432 for (int i = 0; i < clist.Length; i++) {
433 Contraction ct = clist [i];
434 int diff = ct.Source [0] - s [end];
436 return null; // it's already sorted
439 char [] chars = ct.Source;
440 if (start - end + 1 < chars.Length)
443 int offset = start - chars.Length + 1;
444 for (int n = 0; n < chars.Length; n++)
445 if (s [offset + n] != chars [n]) {
455 Contraction GetContraction (char c)
457 Contraction ct = GetContraction (c, contractions);
458 if (ct != null || lcid == 127)
460 return GetContraction (c, invariant.contractions);
463 Contraction GetContraction (char c, Contraction [] clist)
465 for (int i = 0; i < clist.Length; i++) {
466 Contraction ct = clist [i];
467 if (ct.Source [0] > c)
468 return null; // it's already sorted
469 if (ct.Source [0] == c && ct.Source.Length == 1)
475 int FilterOptions (int i)
478 int x = widthCompat [widthIndexer.ToIndex (i)];
483 i = textInfo.ToLower ((char) i);
485 i = Uni.ToKanaTypeInsensitive (i);
489 int previousChar = -1;
490 byte [] previousSortKey = null;
491 int previousChar2 = -1;
492 byte [] previousSortKey2 = null;
502 ExtenderType GetExtenderType (int i)
504 // LAMESPEC: Windows expects true for U+3005, but
505 // sometimes it does not represent to repeat just
507 // Windows also expects true for U+3031 and U+3032,
508 // but they should *never* repeat one character.
510 if (i < 0x3005 || i > 0xFF70)
511 return ExtenderType.None;
516 return ExtenderType.Simple;
518 return ExtenderType.Conditional;
521 return ExtenderType.Voiced;
525 return ExtenderType.None;
527 case 0x3005: // LAMESPEC: see above.
528 return ExtenderType.Buggy;
529 case 0x3031: // LAMESPEC: see above.
530 case 0x3032: // LAMESPEC: see above.
533 return ExtenderType.Simple;
536 return ExtenderType.Voiced;
538 return ExtenderType.Conditional;
540 return ExtenderType.None;
544 byte ToDashTypeValue (ExtenderType ext)
546 if (ignoreNonSpace) // LAMESPEC: huh, why?
549 case ExtenderType.None:
551 case ExtenderType.Conditional:
558 int FilterExtender (int i, ExtenderType ext)
560 if (ext == ExtenderType.Conditional &&
561 Uni.HasSpecialWeight ((char) i)) {
562 bool half = IsHalfKana ((char) i);
563 bool katakana = !Uni.IsHiragana ((char) i);
564 switch (Level1 (i) & 7) {
566 return half ? 0xFF71 : katakana ? 0x30A2 : 0x3042;
568 return half ? 0xFF72 : katakana ? 0x30A4 : 0x3044;
570 return half ? 0xFF73 : katakana ? 0x30A6 : 0x3046;
572 return half ? 0xFF74 : katakana ? 0x30A8 : 0x3048;
574 return half ? 0xFF75 : katakana ? 0x30AA : 0x304A;
580 bool IsIgnorable (int i)
582 return Uni.IsIgnorable (i) ||
583 ignoreSymbols && Uni.IsIgnorableSymbol (i) ||
584 ignoreNonSpace && Uni.IsIgnorableNonSpacing (i);
590 public SortKey GetSortKey (string s)
592 return GetSortKey (s, CompareOptions.None);
595 public SortKey GetSortKey (string s, CompareOptions options)
597 return GetSortKey (s, 0, s.Length, options);
600 public SortKey GetSortKey (string s, int start, int length, CompareOptions options)
602 SetOptions (options);
604 buf.Initialize (options, lcid, s, frenchSort);
605 int end = start + length;
606 GetSortKey (s, start, end);
607 return buf.GetResultAndReset ();
610 void GetSortKey (string s, int start, int end)
612 for (int n = start; n < end; n++) {
615 ExtenderType ext = GetExtenderType (i);
616 if (ext != ExtenderType.None) {
617 i = FilterExtender (previousChar, ext);
619 FillSortKeyRaw (i, ext);
620 else if (previousSortKey != null) {
621 byte [] b = previousSortKey;
625 b [2] != 1 ? b [2] : Level2 (i, ext),
626 b [3] != 1 ? b [3] : level3 [lv3Indexer.ToIndex (i)]);
628 // otherwise do nothing.
629 // (if the extender is the first char
630 // in the string, then just ignore.)
636 i = FilterOptions (i);
638 Contraction ct = GetContraction (s, n, end);
640 if (ct.Replacement != null) {
641 GetSortKey (ct.Replacement, 0, ct.Replacement.Length);
643 byte [] b = ct.SortKey;
647 b [2] != 1 ? b [2] : Level2 (i, ext),
648 b [3] != 1 ? b [3] : level3 [lv3Indexer.ToIndex (i)]);
652 n += ct.Source.Length - 1;
655 if (!Uni.IsIgnorableNonSpacing (i))
657 FillSortKeyRaw (i, ExtenderType.None);
662 void FillSortKeyRaw (int i, ExtenderType ext)
664 if (0x3400 <= i && i <= 0x4DB5) {
665 int diff = i - 0x3400;
666 buf.AppendCJKExtension (
667 (byte) (0x10 + diff / 254),
668 (byte) (diff % 254 + 2));
672 UnicodeCategory uc = char.GetUnicodeCategory ((char) i);
674 case UnicodeCategory.PrivateUse:
675 int diff = i - 0xE000;
677 (byte) (0xE5 + diff / 254),
678 (byte) (diff % 254 + 2),
682 case UnicodeCategory.Surrogate:
683 FillSurrogateSortKeyRaw (i);
687 byte level2 = Level2 (i, ext);
688 if (Uni.HasSpecialWeight ((char) i)) {
689 byte level1 = Level1 (i);
695 Uni.IsJapaneseSmallLetter ((char) i),
696 ToDashTypeValue (ext),
697 !Uni.IsHiragana ((char) i),
698 IsHalfKana ((char) i)
700 if (!ignoreNonSpace && ext == ExtenderType.Voiced)
701 // Append voice weight
702 buf.AppendNormal (1, 1, 1, 0);
712 void FillSurrogateSortKeyRaw (int i)
721 lower = (byte) ((i == 0xD800) ? 0x3E : 0x3F);
722 } else if (0xD840 <= i && i < 0xD880) {
726 } else if (0xDB80 <= i && i < 0xDC00) {
727 diffbase = 0xDB80 - 0x40;
731 diffbase = 0xDC00 - 0xF8 + 2;
735 int diff = i - diffbase;
738 (byte) (segment + diff / 254),
739 (byte) (diff % 254 + 2),
748 public int Compare (string s1, string s2)
750 return Compare (s1, s2, CompareOptions.None);
753 public int Compare (string s1, string s2, CompareOptions options)
755 return Compare (s1, 0, s1.Length, s2, 0, s2.Length, options);
760 public string Source;
766 // Those instances are reused not to invoke instantiation
768 Escape escape1 = new Escape ();
769 Escape escape2 = new Escape ();
771 private int CompareOrdinal (string s1, int idx1, int len1,
772 string s2, int idx2, int len2)
774 int min = len1 < len2 ? len1 : len2;
775 int end1 = idx1 + min;
776 int end2 = idx2 + min;
777 for (int i1 = idx1, i2 = idx2;
778 i1 < end1 && i2 < end2; i1++, i2++)
779 if (s1 [i1] != s2 [i2])
780 return s1 [i1] - s2 [i2];
781 return len1 == len2 ? 0 :
782 len1 == min ? - 1 : 1;
785 public int Compare (string s1, int idx1, int len1,
786 string s2, int idx2, int len2, CompareOptions options)
788 // quick equality check
789 if (idx1 == idx2 && len1 == len2 &&
790 Object.ReferenceEquals (s1, s2))
792 // FIXME: this should be done inside Compare() at
794 // int ord = CompareOrdinal (s1, idx1, len1, s2, idx2, len2);
797 if (options == CompareOptions.Ordinal)
798 return CompareOrdinal (s1, idx1, len1, s2, idx2, len2);
800 #if false // stable easy version, depends on GetSortKey().
801 SortKey sk1 = GetSortKey (s1, idx1, len1, options);
802 SortKey sk2 = GetSortKey (s2, idx2, len2, options);
803 byte [] d1 = sk1.KeyData;
804 byte [] d2 = sk2.KeyData;
805 int len = d1.Length > d2.Length ? d2.Length : d1.Length;
806 for (int i = 0; i < len; i++)
807 if (d1 [i] != d2 [i])
808 return d1 [i] < d2 [i] ? -1 : 1;
809 return d1.Length == d2.Length ? 0 : d1.Length < d2.Length ? -1 : 1;
811 SetOptions (options);
813 int ret = CompareInternal (s1, idx1, len1, s2, idx2, len2, (options & CompareOptions.StringSort) != 0, out dummy, out dummy2, true);
814 return ret == 0 ? 0 : ret < 0 ? -1 : 1;
818 int CompareInternal (string s1, int idx1, int len1, string s2,
819 int idx2, int len2, bool stringSort,
820 out bool targetConsumed, out bool sourceConsumed,
821 bool skipHeadingExtenders)
825 int end1 = idx1 + len1;
826 int end2 = idx2 + len2;
827 targetConsumed = false;
828 sourceConsumed = false;
830 // It holds final result that comes from the comparison
831 // at level 2 or lower. Even if Compare() found the
832 // difference at level 2 or lower, it still has to
833 // continue level 1 comparison. FinalResult is used
834 // when there was no further differences.
836 // It holds the comparison level to do. It starts from
837 // 5, and becomes 1 when we do primary-only comparison.
838 int currentLevel = 5;
845 // Skip heading extenders
846 if (skipHeadingExtenders) {
847 for (; idx1 < end1; idx1++)
848 if (GetExtenderType (s1 [idx1]) == ExtenderType.None)
850 for (; idx2 < end2; idx2++)
851 if (GetExtenderType (s2 [idx2]) == ExtenderType.None)
855 ExtenderType ext1 = ExtenderType.None;
856 ExtenderType ext2 = ExtenderType.None;
859 for (; idx1 < end1; idx1++)
860 if (!IsIgnorable (s1 [idx1]))
862 for (; idx2 < end2; idx2++)
863 if (!IsIgnorable (s2 [idx2]))
867 if (escape1.Source == null)
870 start1 = escape1.Start;
871 idx1 = escape1.Index;
873 escape1.Source = null;
877 if (escape2.Source == null)
880 start2 = escape2.Start;
881 idx2 = escape2.Index;
883 escape2.Source = null;
887 // FIXME: optimization could be done here.
889 if (s1 [idx1] == s2 [idx2]) {
894 // while (idx1 >= start1 && !IsSafe ((int) s [idx1]))
896 // while (idx2 >= start2 && !IsSafe ((int) s [idx2]))
904 int i1 = FilterOptions (s1 [idx1]);
905 int i2 = FilterOptions (s2 [idx2]);
906 bool special1 = false;
907 bool special2 = false;
909 // If current character is an expander, then
910 // repeat the previous character.
911 ext1 = GetExtenderType (i1);
912 if (ext1 != ExtenderType.None) {
913 if (previousChar < 0) {
914 if (previousSortKey == null) {
919 sk1 = previousSortKey;
922 i1 = FilterExtender (previousChar, ext1);
924 ext2 = GetExtenderType (i2);
925 if (ext2 != ExtenderType.None) {
926 if (previousChar2 < 0) {
927 if (previousSortKey2 == null) {
932 sk2 = previousSortKey2;
935 i2 = FilterExtender (previousChar2, ext2);
938 byte cat1 = Category (i1);
939 byte cat2 = Category (i2);
941 // Handle special weight characters
942 if (!stringSort && currentLevel > 4) {
944 lv5At1 = escape1.Source != null ?
945 escape1.Index - escape1.Start :
947 // here Windows has a bug that it does
948 // not consider thirtiary weight.
949 lv5Value1 = Level1 (i1) << 8 + Uni.Level3 (i1);
954 lv5At2 = escape2.Source != null ?
955 escape2.Index - escape2.Start :
957 // here Windows has a bug that it does
958 // not consider thirtiary weight.
959 lv5Value2 = Level1 (i2) << 8 + Uni.Level3 (i2);
963 if (cat1 == 6 || cat2 == 6) {
969 Contraction ct1 = null;
970 if (ext1 == ExtenderType.None)
971 ct1 = GetContraction (s1, idx1, end1);
976 else if (ct1 != null) {
977 offset1 = ct1.Source.Length;
978 if (ct1.SortKey != null) {
980 for (int i = 0; i < ct1.SortKey.Length; i++)
981 sk1 [i] = ct1.SortKey [i];
983 previousSortKey = sk1;
985 else if (escape1.Source == null) {
987 escape1.Start = start1;
988 escape1.Index = cur1 + ct1.Source.Length;
990 s1 = ct1.Replacement;
1000 sk1 [1] = Level1 (i1);
1001 if (!ignoreNonSpace && currentLevel > 1)
1002 sk1 [2] = Level2 (i1, ext1);
1003 if (currentLevel > 2)
1004 sk1 [3] = Uni.Level3 (i1);
1005 if (currentLevel > 3)
1006 special1 = Uni.HasSpecialWeight ((char) i1);
1011 Contraction ct2 = null;
1012 if (ext2 == ExtenderType.None)
1013 ct2 = GetContraction (s2, idx2, end2);
1017 else if (ct2 != null) {
1018 idx2 += ct2.Source.Length;
1019 if (ct2.SortKey != null) {
1021 for (int i = 0; i < ct2.SortKey.Length; i++)
1022 sk2 [i] = ct2.SortKey [i];
1024 previousSortKey2 = sk2;
1026 else if (escape2.Source == null) {
1027 escape2.Source = s2;
1028 escape2.Start = start2;
1029 escape2.Index = cur2 + ct2.Source.Length;
1031 s2 = ct2.Replacement;
1041 sk2 [1] = Level1 (i2);
1042 if (!ignoreNonSpace && currentLevel > 1)
1043 sk2 [2] = Level2 (i2, ext2);
1044 if (currentLevel > 2)
1045 sk2 [3] = Uni.Level3 (i2);
1046 if (currentLevel > 3)
1047 special2 = Uni.HasSpecialWeight ((char) i2);
1053 // Add offset here so that it does not skip
1054 // idx1 while s2 has a replacement.
1057 // add diacritical marks in s1 here
1058 if (!ignoreNonSpace) {
1059 while (idx1 < end1) {
1060 if (Category (s1 [idx1]) != 1)
1064 sk1 [2] = (byte) (sk1 [2] +
1065 Level2 (s1 [idx1], ExtenderType.None));
1069 // add diacritical marks in s2 here
1070 while (idx2 < end2) {
1071 if (Category (s2 [idx2]) != 1)
1075 sk2 [2] = (byte) (sk2 [2] +
1076 Level2 (s2 [idx2], ExtenderType.None));
1081 int ret = sk1 [0] - sk2 [0];
1082 ret = ret != 0 ? ret : sk1 [1] - sk2 [1];
1085 if (currentLevel == 1)
1087 if (!ignoreNonSpace) {
1088 ret = sk1 [2] - sk2 [2];
1091 currentLevel = frenchSort ? 2 : 1;
1095 if (currentLevel == 2)
1097 ret = sk1 [3] - sk2 [3];
1103 if (currentLevel == 3)
1105 if (special1 != special2) {
1106 finalResult = special1 ? 1 : -1;
1111 ret = CompareFlagPair (
1112 !Uni.IsJapaneseSmallLetter ((char) i1),
1113 !Uni.IsJapaneseSmallLetter ((char) i2));
1114 ret = ret != 0 ? ret :
1115 ToDashTypeValue (ext1) -
1116 ToDashTypeValue (ext2);
1117 ret = ret != 0 ? ret : CompareFlagPair (
1118 Uni.IsHiragana ((char) i1),
1119 Uni.IsHiragana ((char) i2));
1120 ret = ret != 0 ? ret : CompareFlagPair (
1121 !IsHalfKana ((char) i1),
1122 !IsHalfKana ((char) i2));
1131 // If there were only level 3 or lower differences,
1132 // then we still have to find diacritical differences
1134 if (!ignoreNonSpace &&
1135 finalResult != 0 && currentLevel > 2) {
1136 while (idx1 < end1 && idx2 < end2) {
1137 if (!Uni.IsIgnorableNonSpacing (s1 [idx1]))
1139 if (!Uni.IsIgnorableNonSpacing (s2 [idx2]))
1141 finalResult = Level2 (FilterOptions ((s1 [idx1])), ext1) - Level2 (FilterOptions (s2 [idx2]), ext2);
1142 if (finalResult != 0)
1146 // they should work only for the first character
1147 ext1 = ExtenderType.None;
1148 ext2 = ExtenderType.None;
1151 // we still have to handle level 5
1152 if (finalResult == 0) {
1153 finalResult = lv5At1 - lv5At2;
1154 if (finalResult == 0)
1155 finalResult = lv5Value1 - lv5Value2;
1157 if (finalResult == 0) {
1159 targetConsumed = true;
1161 sourceConsumed = true;
1163 return idx1 != end1 ? 1 : idx2 == end2 ? finalResult : -1;
1168 #region IsPrefix() and IsSuffix()
1170 public bool IsPrefix (string src, string target, CompareOptions opt)
1172 return IsPrefix (src, target, 0, src.Length, opt);
1175 public bool IsPrefix (string s, string target, int start, int length, CompareOptions opt)
1178 return IsPrefix (s, target, start, length,
1179 (opt & CompareOptions.StringSort) != 0, true);
1182 public bool IsPrefix (string s, string target, int start, int length, bool stringSort, bool skipHeadingExtenders)
1184 bool consumed, dummy;
1185 int ret = CompareInternal (s, start, length,
1186 target, 0, target.Length, stringSort,
1187 out consumed, out dummy, skipHeadingExtenders);
1193 public bool IsSuffix (string src, string target, CompareOptions opt)
1195 return IsSuffix (src, target, src.Length - 1, src.Length, opt);
1198 public bool IsSuffix (string s, string target, int start, int length, CompareOptions opt)
1200 // quick check : simple codepoint comparison
1201 if (s.Length >= target.Length) {
1203 int se = start - length;
1204 for (int i = target.Length - 1; si >= se && i >= 0; i--, si--)
1205 if (s [si] != target [i])
1207 if (si == start + target.Length)
1212 return IsSuffix (s, target, start, length,
1213 (opt & CompareOptions.StringSort) != 0);
1216 bool New_IsSuffix (string s, string t, int start, int length,
1220 for (;tstart < t.Length; tstart++)
1221 if (!IsIgnorable (t [tstart]))
1223 if (tstart == t.Length)
1224 return true; // as if target is String.Empty.
1226 // FIXME: could be more efficient.
1227 bool consumed, dummy;
1228 for (int i = 0; i < length; i++) {
1229 int ret = CompareInternal (s, start - i, i + 1,
1230 t, tstart, t.Length, stringSort,
1231 out dummy, out consumed, true);
1238 // FIXME: This code has a fatal problem that it cannot handle
1240 bool IsSuffix (string s, string t, int start, int length,
1244 int ti = t.Length - 1;
1245 int end = start - length + 1;
1250 if (escape2.Source == null)
1255 escape2.Source = null;
1258 if (IsIgnorable (t [ti])) {
1263 if (escape1.Source == null)
1268 escape1.Source = null;
1271 if (IsIgnorable (s [si])) {
1276 ExtenderType ext1 = GetExtenderType (s [si]);
1277 ExtenderType ext2 = GetExtenderType (t [ti]);
1281 // Check contraction for target.
1282 Contraction ctt = null;
1283 if (ext2 != ExtenderType.None) {
1284 if (previousChar2 < 0)
1285 sk = previousSortKey2;
1287 vt = FilterExtender (previousChar2, ext2);
1289 else if (escape2.Source == null)
1290 ctt = GetTailContraction (t, ti, 0);
1292 if (ctt.SortKey != null) {
1293 ti -= ctt.Source.Length;
1294 for (int i = 0; i < ctt.SortKey.Length; i++)
1295 charSortKey2 [i] = ctt.SortKey [i];
1297 previousSortKey2 = charSortKey2;
1300 escape2.Index = ti - ctt.Source.Length;
1302 t = ctt.Replacement;
1303 ti = ctt.Replacement.Length - 1;
1307 } else if (sk == null) {
1309 vt = FilterOptions (t [ti]);
1311 sk [0] = Category (vt);
1312 sk [1] = Level1 (vt);
1313 if (!ignoreNonSpace)
1314 // FIXME: pass extender type
1315 sk [2] = Level2 (vt, ext2);
1316 sk [3] = Uni.Level3 (vt);
1321 if (!MatchesBackward (ref s, ref si, ref end, vt, sk, !Uni.HasSpecialWeight ((char) vt)))
1325 // All codepoints in the compared range
1326 // matches. In that case, what matters
1327 // is whether the remaining part of
1328 // "target" is ignorable or not.
1330 if (!IsIgnorable (t [ti--]))
1337 // WARNING: Don't invoke it outside IsSuffix().
1338 // returns consumed source length (mostly 1, source length in case of contraction)
1339 bool MatchesBackward (ref string s, ref int idx, ref int end, int it, byte [] sortkey, bool noLv4)
1341 Contraction ct = null;
1342 // If there is already expansion, then it should not
1343 // process further expansions.
1344 if (escape1.Source == null)
1345 ct = GetTailContraction (s, idx, end);
1347 if (ct.SortKey != null) {
1350 for (int i = 0; i < ct.SortKey.Length; i++)
1351 if (sortkey [i] != ct.SortKey [i])
1353 idx -= ct.Source.Length;
1357 escape1.Index = idx - ct.Source.Length;
1362 return MatchesBackward (ref s, ref idx, ref end, it, sortkey, noLv4);
1365 // primitive comparison
1366 if (!MatchesPrimitive (s [idx], it, sortkey))
1373 // Common use methods
1375 // returns comparison result.
1376 private bool MatchesPrimitive (char src, int ct, byte [] sortkey)
1378 // char-by-char comparison.
1379 int cs = FilterOptions (src);
1383 if (Category (cs) != sortkey [0] ||
1384 Level1 (cs) != sortkey [1] ||
1385 !ignoreNonSpace && Level2 (cs, ExtenderType.None) != sortkey [2] ||
1386 Uni.Level3 (cs) != sortkey [3])
1388 // lv.4 (only when required). No need to check cj coz
1389 // there is no pair of characters that has the same
1390 // primary level and differs here.
1391 if (!Uni.HasSpecialWeight (src))
1393 char target = (char) ct;
1394 return Uni.IsJapaneseSmallLetter (src) !=
1395 Uni.IsJapaneseSmallLetter (target) ||
1396 Uni.GetJapaneseDashType (src) !=
1397 Uni.GetJapaneseDashType (target) ||
1398 Uni.IsHiragana (src) !=
1399 Uni.IsHiragana (target) ||
1400 IsHalfKana (src) != IsHalfKana (target);
1403 int CompareFlagPair (bool b1, bool b2)
1405 return b1 == b2 ? 0 : b1 ? 1 : -1;
1410 #region IndexOf() / LastIndexOf()
1412 // IndexOf (string, string, CompareOptions)
1413 // IndexOf (string, string, int, int, CompareOptions)
1414 // IndexOf (string, char, int, int, CompareOptions)
1415 // IndexOfPrimitiveChar (string, int, int, char)
1416 // IndexOfSortKey (string, int, int, byte[], char, int, bool)
1417 // IndexOf (string, string, int, int)
1419 public int IndexOf (string s, string target, CompareOptions opt)
1421 return IndexOf (s, target, 0, s.Length, opt);
1424 public int IndexOf (string s, string target, int start, int length, CompareOptions opt)
1427 return IndexOf (s, target, start, length,
1428 (opt & CompareOptions.StringSort) != 0);
1431 public int IndexOf (string s, char target, CompareOptions opt)
1433 return IndexOf (s, target, 0, s.Length, opt);
1436 public int IndexOf (string s, char target, int start, int length, CompareOptions opt)
1440 // If target is contraction, then use string search.
1441 Contraction ct = GetContraction (target);
1443 if (ct.Replacement != null)
1444 return IndexOf (s, ct.Replacement, start, length,
1445 (opt & CompareOptions.StringSort) != 0);
1447 return IndexOfSortKey (s, start, length, ct.SortKey, char.MinValue, -1, true);
1450 return IndexOfPrimitiveChar (s, start, length, target);
1453 // Searches target char w/o checking contractions
1454 int IndexOfPrimitiveChar (string s, int start, int length, char target)
1456 int ti = FilterOptions ((int) target);
1457 charSortKeyIndexTarget [0] = Category (ti);
1458 charSortKeyIndexTarget [1] = Level1 (ti);
1459 if (!ignoreNonSpace)
1460 charSortKeyIndexTarget [2] = Level2 (ti, ExtenderType.None);
1461 charSortKeyIndexTarget [3] = Uni.Level3 (ti);
1462 return IndexOfSortKey (s, start, length, charSortKeyIndexTarget, target, ti, !Uni.HasSpecialWeight ((char) ti));
1465 // Searches target byte[] keydata
1466 int IndexOfSortKey (string s, int start, int length, byte [] sortkey, char target, int ti, bool noLv4)
1468 int end = start + length;
1472 if (MatchesForward (s, ref idx, end, ti, sortkey, noLv4))
1478 // Searches string. Search head character (or keydata when
1479 // the head is contraction sortkey) and try IsPrefix().
1480 int IndexOf (string s, string target, int start, int length, bool stringSort)
1483 for (; tidx < target.Length; tidx++)
1484 if (!IsIgnorable (target [tidx]))
1486 if (tidx == target.Length)
1488 Contraction ct = GetContraction (target, tidx, target.Length - tidx);
1489 string replace = ct != null ? ct.Replacement : null;
1490 byte [] sk = replace == null ? charSortKeyIndexTarget : null;
1492 char tc = char.MinValue;
1494 if (ct != null && sk != null) {
1495 for (int i = 0; i < ct.SortKey.Length; i++)
1496 sk [i] = ct.SortKey [i];
1497 } else if (sk != null) {
1499 ti = FilterOptions (target [tidx]);
1500 sk [0] = Category (ti);
1501 sk [1] = Level1 (ti);
1502 if (!ignoreNonSpace)
1503 sk [2] = Level2 (ti, ExtenderType.None);
1504 sk [3] = Uni.Level3 (ti);
1505 noLv4 = !Uni.HasSpecialWeight ((char) ti);
1508 for (tidx++; tidx < target.Length; tidx++) {
1509 if (Category (target [tidx]) != 1)
1513 sk [2] = (byte) (sk [2] + Level2 (target [tidx], ExtenderType.None));
1519 if (replace != null)
1520 idx = IndexOf (s, replace, start, length, stringSort);
1522 idx = IndexOfSortKey (s, start, length, sk, tc, ti, noLv4);
1525 length -= idx - start;
1527 if (IsPrefix (s, target, start, length, stringSort, false))
1529 Contraction cts = GetContraction (s, start, length);
1531 start += cts.Source.Length;
1532 length -= cts.Source.Length;
1537 } while (length > 0);
1542 // There are the same number of IndexOf() related methods,
1543 // with the same functionalities for each.
1546 public int LastIndexOf (string s, string target, CompareOptions opt)
1548 return LastIndexOf (s, target, s.Length - 1, s.Length, opt);
1551 public int LastIndexOf (string s, string target, int start, int length, CompareOptions opt)
1554 return LastIndexOf (s, target, start, length,
1555 (opt & CompareOptions.StringSort) != 0);
1558 public int LastIndexOf (string s, char target, CompareOptions opt)
1560 return LastIndexOf (s, target, s.Length - 1, s.Length, opt);
1563 public int LastIndexOf (string s, char target, int start, int length, CompareOptions opt)
1567 // If target is contraction, then use string search.
1568 Contraction ct = GetContraction (target);
1570 if (ct.Replacement != null)
1571 return LastIndexOf (s, ct.Replacement, start, length, (opt & CompareOptions.StringSort) != 0);
1573 return LastIndexOfSortKey (s, start, length, ct.SortKey, char.MinValue, -1, true);
1576 return LastIndexOfPrimitiveChar (s, start, length, target);
1579 // Searches target char w/o checking contractions
1580 int LastIndexOfPrimitiveChar (string s, int start, int length, char target)
1582 int ti = FilterOptions ((int) target);
1583 charSortKey [0] = Category (ti);
1584 charSortKey [1] = Level1 (ti);
1585 if (!ignoreNonSpace)
1586 // FIXME: pass ExtenderType
1587 charSortKey [2] = Level2 (ti, ExtenderType.None);
1588 charSortKey [3] = Uni.Level3 (ti);
1589 return LastIndexOfSortKey (s, start, length, charSortKey, target, ti, !Uni.HasSpecialWeight ((char) ti));
1592 // Searches target byte[] keydata
1593 int LastIndexOfSortKey (string s, int start, int length, byte [] sortkey, char target, int ti, bool noLv4)
1595 int end = start - length;
1597 for (int idx = start; idx > end; idx--) {
1599 if (Matches (s, ref idx, end, ti, target, sortkey, noLv4, true))
1605 // Searches string. Search head character (or keydata when
1606 // the head is contraction sortkey) and try IsPrefix().
1607 int LastIndexOf (string s, string target, int start, int length, bool stringSort)
1609 int orgStart = start;
1611 for (; tidx < target.Length; tidx++)
1612 if (!IsIgnorable (target [tidx]))
1614 if (tidx == target.Length)
1616 Contraction ct = GetContraction (target, tidx, target.Length - tidx);
1617 byte [] sortkey = ct != null ? ct.SortKey : null;
1618 string replace = ct != null ? ct.Replacement : null;
1622 if (sortkey != null)
1623 idx = LastIndexOfSortKey (s, start, length, ct.SortKey, char.MinValue, -1, true);
1624 else if (replace != null)
1625 idx = LastIndexOf (s, replace, start, length, stringSort);
1627 idx = LastIndexOfPrimitiveChar (s, start, length, target [tidx]);
1631 if (IsPrefix (s, target, idx, orgStart - idx + 1, stringSort, false)) {
1632 for (;idx < orgStart; idx++)
1633 if (!IsIgnorable (s [idx]))
1637 Contraction cts = GetContraction (s, idx, orgStart - idx + 1);
1639 start -= cts.Source.Length;
1640 length -= cts.Source.Length;
1645 } while (length > 0);
1649 private bool Matches (string s, ref int idx, int end, int ti, char target, byte [] sortkey, bool noLv4, bool lastIndexOf)
1651 switch (char.GetUnicodeCategory (s [idx])) {
1652 case UnicodeCategory.PrivateUse:
1653 case UnicodeCategory.Surrogate:
1654 if (s [idx] != target)
1659 char sc = char.MinValue;
1660 Contraction ct = GetContraction (s, idx, end);
1661 // if lv4 exists, it never matches contraction
1662 if (ct != null && noLv4) {
1664 idx -= ct.Source.Length - 1;
1666 idx += ct.Source.Length - 1;
1667 if (ct.SortKey != null) {
1668 for (int i = 0; i < sortkey.Length; i++)
1669 if (ct.SortKey [i] != sortkey [i])
1673 // Here is the core of LAMESPEC
1674 // described at the top of the source.
1675 sc = ct.Replacement [0];
1682 int si = FilterOptions ((int) sc);
1683 if (Category (si) != sortkey [0] ||
1684 Level1 (si) != sortkey [1] ||
1685 // FIXME: pass ExtenderType
1686 !ignoreNonSpace && Level2 (si, ExtenderType.None) != sortkey [2] ||
1687 Uni.Level3 (si) != sortkey [3])
1689 if (noLv4 && !Uni.HasSpecialWeight ((char) si))
1693 if (Uni.IsJapaneseSmallLetter ((char) si) !=
1694 Uni.IsJapaneseSmallLetter ((char) ti) ||
1695 Uni.GetJapaneseDashType ((char) si) !=
1696 Uni.GetJapaneseDashType ((char) ti) ||
1697 !Uni.IsHiragana ((char) si) !=
1698 !Uni.IsHiragana ((char) ti) ||
1699 IsHalfKana ((char) si) !=
1700 IsHalfKana ((char) ti))
1705 private bool MatchesForward (string s, ref int idx, int end, int ti, byte [] sortkey, bool noLv4)
1708 ExtenderType ext = GetExtenderType (s [idx]);
1709 Contraction ct = null;
1710 if (ext == ExtenderType.None)
1711 ct = GetContraction (s, idx, end);
1712 else if (previousChar < 0)
1713 charSortKey = previousSortKey;
1715 si = FilterExtender (previousChar, ext);
1716 // if lv4 exists, it never matches contraction
1718 idx += ct.Source.Length;
1721 if (ct.SortKey != null) {
1722 for (int i = 0; i < sortkey.Length; i++)
1723 charSortKey [i] = sortkey [i];
1725 previousSortKey = charSortKey;
1727 // Here is the core of LAMESPEC
1728 // described at the top of the source.
1730 return MatchesForward (ct.Replacement, ref dummy,
1731 ct.Replacement.Length, ti, sortkey, noLv4);
1735 si = FilterOptions (s [idx]);
1736 charSortKey [0] = Category (si);
1737 charSortKey [1] = Level1 (si);
1738 if (!ignoreNonSpace)
1739 charSortKey [2] = Level2 (si, ext);
1740 charSortKey [3] = Uni.Level3 (si);
1741 if (charSortKey [0] != 1)
1745 for (; idx < end; idx++) {
1746 if (Category (s [idx]) != 1)
1750 if (charSortKey [2] == 0)
1751 charSortKey [2] = 2;
1752 charSortKey [2] = (byte) (charSortKey [2]
1753 + Level2 (s [idx], ExtenderType.None));
1756 if (charSortKey [0] != sortkey [0] ||
1757 charSortKey [1] != sortkey [1] ||
1758 (!ignoreNonSpace && charSortKey [2] != sortkey [2]) ||
1759 charSortKey [3] != sortkey [3])
1761 if (noLv4 && (si < 0 || !Uni.HasSpecialWeight ((char) si)))
1765 // Since target can never be an extender, if the source
1766 // is an expander and it matters, then they never match.
1767 if (!ignoreNonSpace && ext == ExtenderType.Conditional)
1769 if (Uni.IsJapaneseSmallLetter ((char) si) !=
1770 Uni.IsJapaneseSmallLetter ((char) ti) ||
1771 Uni.GetJapaneseDashType ((char) si) !=
1772 Uni.GetJapaneseDashType ((char) ti) ||
1773 !Uni.IsHiragana ((char) si) !=
1774 !Uni.IsHiragana ((char) ti) ||
1775 IsHalfKana ((char) si) !=
1776 IsHalfKana ((char) ti))