4 // This class will demonstrate CompareInfo functionality that will just work.
8 // Here's a summary for supporting contractions, expansions and diacritical
11 // Diacritical mapping is a simple tailoring rule that "remaps" diacritical
12 // weight value from one to another. For now it applies to all range of
13 // characters, but at some stage we might need to limit the remapping ranges.
15 // A Contraction consists of a string (actually char[]) and a sortkey entry
16 // (i.e. byte[]). It indicates that a sequence of characters is interpreted
17 // as a single character that has the mapped sortkey value. There is no
18 // character which goes across "special rules". When the collator encountered
19 // such a sequence of characters, it just returns the sortkey value without
20 // further replacement.
22 // Since it is still likely to happen that a contraction sequence matches
23 // other character than the identical sequence (depending on CompareOptions
24 // and of course, defined sortkey value itself), comparison cannot be done
25 // at source char[] level.
31 // In IndexOf(), the first character in the target string or the target char
32 // itself is turned into sortkey bytes. If the character has a contraction and
33 // that is sortkey map, then it is used instead. If the contraction exists and
34 // that is replacement map, then the first character of the replacement string
35 // is searched instead. IndexOf() always searches only for the top character,
36 // and if it returned negative value, then it returns -1. Otherwise, it then
37 // tries IsPrefix() from that location. If it returns true, then it returns
41 // LAMESPEC: IndexOf() is lame as a whole API. It never matches in the middle
42 // of expansion and there is no proper way to return such indexes within
43 // a single int return value.
45 // For example, try below in .NET:
46 // IndexOf("\u00E6", "a")
47 // IndexOf("\u00E6", "e")
52 using System.Collections;
53 using System.Globalization;
55 using Uni = Mono.Globalization.Unicode.MSCompatUnicodeTable;
56 using UUtil = Mono.Globalization.Unicode.MSCompatUnicodeTableUtil;
58 namespace Mono.Globalization.Unicode
60 internal class SimpleCollator
62 static SimpleCollator invariant =
63 new SimpleCollator (CultureInfo.InvariantCulture);
65 internal static readonly byte [] ignorableFlags =
67 internal static readonly byte [] categories =
69 internal static readonly byte [] level1 =
71 internal static readonly byte [] level2 =
73 internal static readonly byte [] level3 =
75 internal static readonly ushort [] widthCompat =
77 internal static readonly CodePointIndexer categoryIndexer =
79 internal static readonly CodePointIndexer lv1Indexer =
81 internal static readonly CodePointIndexer lv2Indexer =
83 internal static readonly CodePointIndexer lv3Indexer =
85 internal static readonly CodePointIndexer widthIndexer =
90 // CompareOptions expanded.
91 bool ignoreNonSpace; // used in IndexOf()
96 TextInfo textInfo; // for ToLower().
98 readonly ushort [] cjkTable;
99 readonly CodePointIndexer cjkIndexer;
100 readonly byte [] cjkLv2Table;
101 readonly CodePointIndexer cjkLv2Indexer;
103 readonly Contraction [] contractions;
104 readonly Level2Map [] level2Maps;
106 // temporary sortkey buffer for index search/comparison
107 byte [] charSortKey = new byte [4];
108 byte [] charSortKey2 = new byte [4];
109 byte [] charSortKeyIndexTarget = new byte [4];
111 #region Tailoring support classes
112 // Possible mapping types are:
114 // - string to string (ReplacementMap)
115 // - string to SortKey (SortKeyMap)
116 // - diacritical byte to byte (DiacriticalMap)
118 // There could be mapping from string to sortkeys, but
119 // for now there is none as such.
121 internal class Contraction
123 public readonly char [] Source;
124 // only either of them is used.
125 public readonly string Replacement;
126 public readonly byte [] SortKey;
128 public Contraction (char [] source,
129 string replacement, byte [] sortkey)
132 Replacement = replacement;
137 internal class ContractionComparer : IComparer
139 public static readonly ContractionComparer Instance =
140 new ContractionComparer ();
142 public int Compare (object o1, object o2)
144 Contraction c1 = (Contraction) o1;
145 Contraction c2 = (Contraction) o2;
146 char [] a1 = c1.Source;
147 char [] a2 = c2.Source;
148 int min = a1.Length > a2.Length ?
149 a2.Length : a1.Length;
150 for (int i = 0; i < min; i++)
151 if (a1 [i] != a2 [i])
152 return a1 [i] - a2 [i];
153 return a1.Length - a2.Length;
157 internal class Level2Map
162 public Level2Map (byte source, byte replace)
169 internal class Level2MapComparer : IComparer
171 public static readonly Level2MapComparer Instance =
172 new Level2MapComparer ();
174 public int Compare (object o1, object o2)
176 Level2Map m1 = (Level2Map) o1;
177 Level2Map m2 = (Level2Map) o2;
178 return (m1.Source - m2.Source);
184 #region .ctor() and split functions
186 public SimpleCollator (CultureInfo culture)
189 textInfo = culture.TextInfo;
190 buf = new SortKeyBuffer (culture.LCID);
192 SetCJKTable (culture, ref cjkTable, ref cjkIndexer,
193 ref cjkLv2Table, ref cjkLv2Indexer);
195 // Get tailoring info
196 TailoringInfo t = null;
197 for (CultureInfo ci = culture; ci.LCID != 127; ci = ci.Parent) {
198 t = Uni.GetTailoringInfo (ci.LCID);
202 if (t == null) // then use invariant
203 t = Uni.GetTailoringInfo (127);
205 frenchSort = t.FrenchSort;
206 BuildTailoringTables (culture, t, ref contractions,
208 // FIXME: Since tailorings are mostly for latin
209 // (and in some cases Cyrillic) characters, it would
210 // be much better for performance to store "start
211 // indexes" for > 370 (culture-specific letters).
214 // dump tailoring table
215 Console.WriteLine ("******** building table for {0} : c - {1} d - {2}",
216 culture.LCID, contractions.Length, level2Maps.Length);
217 foreach (Contraction c in contractions) {
218 foreach (char cc in c.Source)
219 Console.Write ("{0:X4} ", (int) cc);
220 Console.WriteLine (" -> '{0}'", c.Replacement);
225 private void BuildTailoringTables (CultureInfo culture,
227 ref Contraction [] contractions,
228 ref Level2Map [] diacriticals)
230 // collect tailoring entries.
231 ArrayList cmaps = new ArrayList ();
232 ArrayList dmaps = new ArrayList ();
233 char [] tarr = Uni.TailoringValues;
234 int idx = t.TailoringIndex;
235 int end = idx + t.TailoringCount;
239 switch (tarr [idx]) {
240 case '\x1': // SortKeyMap
242 while (tarr [ss] != 0)
244 src = new char [ss - idx];
245 Array.Copy (tarr, idx, src, 0, ss - idx);
246 byte [] sortkey = new byte [4];
247 for (int i = 0; i < 4; i++)
248 sortkey [i] = (byte) tarr [ss + 1 + i];
249 cmaps.Add (new Contraction (
250 src, null, sortkey));
254 case '\x2': // DiacriticalMap
255 dmaps.Add (new Level2Map (
256 (byte) tarr [idx + 1],
257 (byte) tarr [idx + 2]));
260 case '\x3': // ReplacementMap
262 while (tarr [ss] != 0)
264 src = new char [ss - idx];
265 Array.Copy (tarr, idx, src, 0, ss - idx);
268 while (tarr [l] != 0)
270 string r = new string (tarr, ss, l - ss);
271 cmaps.Add (new Contraction (
276 throw new NotImplementedException (String.Format ("Mono INTERNAL ERROR (Should not happen): Collation tailoring table is broken for culture {0} ({1}) at 0x{2:X}", culture.LCID, culture.Name, idx));
279 cmaps.Sort (ContractionComparer.Instance);
280 dmaps.Sort (Level2MapComparer.Instance);
281 contractions = cmaps.ToArray (typeof (Contraction))
283 diacriticals = dmaps.ToArray (typeof (Level2Map))
287 private void SetCJKTable (CultureInfo culture,
288 ref ushort [] cjkTable, ref CodePointIndexer cjkIndexer,
289 ref byte [] cjkLv2Table, ref CodePointIndexer cjkLv2Indexer)
291 string name = GetNeutralCulture (culture).Name;
295 // custom CJK table support.
298 cjkTable = Uni.CjkCHS;
299 cjkIndexer = UUtil.CjkCHS;
302 cjkTable = Uni.CjkCHT;
303 cjkIndexer = UUtil.Cjk;
306 cjkTable = Uni.CjkJA;
307 cjkIndexer = UUtil.Cjk;
310 cjkTable = Uni.CjkKO;
311 cjkLv2Table = Uni.CjkKOLv2;
312 cjkIndexer = UUtil.Cjk;
313 cjkLv2Indexer = UUtil.Cjk;
318 static CultureInfo GetNeutralCulture (CultureInfo info)
320 CultureInfo ret = info;
321 while (ret.Parent != null && ret.Parent.LCID != 127)
328 byte Category (int cp)
330 if (cp < 0x3000 || cjkTable == null)
331 return categories [categoryIndexer.ToIndex (cp)];
332 int idx = cjkIndexer.ToIndex (cp);
333 ushort cjk = idx < 0 ? (ushort) 0 : cjkTable [idx];
334 return cjk != 0 ? (byte) ((cjk & 0xFF00) >> 8) :
335 categories [categoryIndexer.ToIndex (cp)];
340 if (cp < 0x3000 || cjkTable == null)
341 return level1 [lv1Indexer.ToIndex (cp)];
342 int idx = cjkIndexer.ToIndex (cp);
343 ushort cjk = idx < 0 ? (ushort) 0 : cjkTable [idx];
344 return cjk != 0 ? (byte) (cjk & 0xFF) :
345 level1 [lv1Indexer.ToIndex (cp)];
348 byte Level2 (int cp, ExtenderType ext)
350 if (ext == ExtenderType.Buggy)
352 else if (ext == ExtenderType.Conditional)
355 if (cp < 0x3000 || cjkLv2Table == null)
356 return level2 [lv2Indexer.ToIndex (cp)];
357 int idx = cjkLv2Indexer.ToIndex (cp);
358 byte ret = idx < 0 ? (byte) 0 : cjkLv2Table [idx];
361 ret = level2 [lv2Indexer.ToIndex (cp)];
362 if (level2Maps.Length == 0)
364 for (int i = 0; i < level2Maps.Length; i++) {
365 if (level2Maps [i].Source == ret)
366 return level2Maps [i].Replace;
367 else if (level2Maps [i].Source > ret)
373 bool IsHalfKana (int cp)
375 return ignoreWidth || Uni.IsHalfWidthKana ((char) cp);
378 void SetOptions (CompareOptions options)
380 this.ignoreNonSpace = (options & CompareOptions.IgnoreNonSpace) != 0;
381 this.ignoreSymbols = (options & CompareOptions.IgnoreSymbols) != 0;
382 this.ignoreWidth = (options & CompareOptions.IgnoreWidth) != 0;
383 this.ignoreCase = (options & CompareOptions.IgnoreCase) != 0;
384 this.ignoreKanaType = (options & CompareOptions.IgnoreKanaType) != 0;
385 previousChar = previousChar2 = -1;
386 previousSortKey = previousSortKey2 = null;
387 escape1.Source = escape2.Source = null;
390 Contraction GetContraction (string s, int start, int end)
392 Contraction c = GetContraction (s, start, end, contractions);
393 if (c != null || lcid == 127)
395 return GetContraction (s, start, end, invariant.contractions);
398 Contraction GetContraction (string s, int start, int end, Contraction [] clist)
400 for (int i = 0; i < clist.Length; i++) {
401 Contraction ct = clist [i];
402 int diff = ct.Source [0] - s [start];
404 return null; // it's already sorted
407 char [] chars = ct.Source;
408 if (end - start < chars.Length)
411 for (int n = 0; n < chars.Length; n++)
412 if (s [start + n] != chars [n]) {
422 Contraction GetTailContraction (string s, int start, int end)
424 Contraction c = GetTailContraction (s, start, end, contractions);
425 if (c != null || lcid == 127)
427 return GetTailContraction (s, start, end, invariant.contractions);
430 Contraction GetTailContraction (string s, int start, int end, Contraction [] clist)
432 for (int i = 0; i < clist.Length; i++) {
433 Contraction ct = clist [i];
434 int diff = ct.Source [0] - s [end];
436 return null; // it's already sorted
439 char [] chars = ct.Source;
440 if (start - end + 1 < chars.Length)
443 int offset = start - chars.Length + 1;
444 for (int n = 0; n < chars.Length; n++)
445 if (s [offset + n] != chars [n]) {
455 Contraction GetContraction (char c)
457 Contraction ct = GetContraction (c, contractions);
458 if (ct != null || lcid == 127)
460 return GetContraction (c, invariant.contractions);
463 Contraction GetContraction (char c, Contraction [] clist)
465 for (int i = 0; i < clist.Length; i++) {
466 Contraction ct = clist [i];
467 if (ct.Source [0] > c)
468 return null; // it's already sorted
469 if (ct.Source [0] == c && ct.Source.Length == 1)
475 int FilterOptions (int i)
478 int x = widthCompat [widthIndexer.ToIndex (i)];
483 i = textInfo.ToLower ((char) i);
485 i = Uni.ToKanaTypeInsensitive (i);
489 int previousChar = -1;
490 byte [] previousSortKey = null;
491 int previousChar2 = -1;
492 byte [] previousSortKey2 = null;
502 ExtenderType GetExtenderType (int i)
504 // LAMESPEC: Windows expects true for U+3005, but
505 // sometimes it does not represent to repeat just
507 // Windows also expects true for U+3031 and U+3032,
508 // but they should *never* repeat one character.
510 if (i < 0x3005 || i > 0xFF70)
511 return ExtenderType.None;
516 return ExtenderType.Simple;
518 return ExtenderType.Conditional;
521 return ExtenderType.Voiced;
525 return ExtenderType.None;
527 case 0x3005: // LAMESPEC: see above.
528 return ExtenderType.Buggy;
529 case 0x3031: // LAMESPEC: see above.
530 case 0x3032: // LAMESPEC: see above.
533 return ExtenderType.Simple;
536 return ExtenderType.Voiced;
538 return ExtenderType.Conditional;
540 return ExtenderType.None;
544 byte ToDashTypeValue (ExtenderType ext)
546 if (ignoreNonSpace) // LAMESPEC: huh, why?
549 case ExtenderType.None:
551 case ExtenderType.Conditional:
558 int FilterExtender (int i, ExtenderType ext)
560 if (ext == ExtenderType.Conditional &&
561 Uni.HasSpecialWeight ((char) i)) {
562 bool half = IsHalfKana ((char) i);
563 bool katakana = !Uni.IsHiragana ((char) i);
564 switch (Level1 (i) & 7) {
566 return half ? 0xFF71 : katakana ? 0x30A2 : 0x3042;
568 return half ? 0xFF72 : katakana ? 0x30A4 : 0x3044;
570 return half ? 0xFF73 : katakana ? 0x30A6 : 0x3046;
572 return half ? 0xFF74 : katakana ? 0x30A8 : 0x3048;
574 return half ? 0xFF75 : katakana ? 0x30AA : 0x304A;
580 bool IsIgnorable (int i)
582 return Uni.IsIgnorable (i) ||
583 ignoreSymbols && Uni.IsIgnorableSymbol (i) ||
584 ignoreNonSpace && Uni.IsIgnorableNonSpacing (i);
590 public SortKey GetSortKey (string s)
592 return GetSortKey (s, CompareOptions.None);
595 public SortKey GetSortKey (string s, CompareOptions options)
597 return GetSortKey (s, 0, s.Length, options);
600 public SortKey GetSortKey (string s, int start, int length, CompareOptions options)
602 SetOptions (options);
604 buf.Initialize (options, lcid, s, frenchSort);
605 int end = start + length;
606 GetSortKey (s, start, end);
607 return buf.GetResultAndReset ();
610 void GetSortKey (string s, int start, int end)
612 for (int n = start; n < end; n++) {
615 ExtenderType ext = GetExtenderType (i);
616 if (ext != ExtenderType.None) {
617 i = FilterExtender (previousChar, ext);
619 FillSortKeyRaw (i, ext);
620 else if (previousSortKey != null) {
621 byte [] b = previousSortKey;
625 b [2] != 1 ? b [2] : Level2 (i, ext),
626 b [3] != 1 ? b [3] : level3 [lv3Indexer.ToIndex (i)]);
628 // otherwise do nothing.
629 // (if the extender is the first char
630 // in the string, then just ignore.)
636 i = FilterOptions (i);
638 Contraction ct = GetContraction (s, n, end);
640 if (ct.Replacement != null) {
641 GetSortKey (ct.Replacement, 0, ct.Replacement.Length);
643 byte [] b = ct.SortKey;
647 b [2] != 1 ? b [2] : Level2 (i, ext),
648 b [3] != 1 ? b [3] : level3 [lv3Indexer.ToIndex (i)]);
652 n += ct.Source.Length - 1;
655 if (!Uni.IsIgnorableNonSpacing (i))
657 FillSortKeyRaw (i, ExtenderType.None);
662 void FillSortKeyRaw (int i, ExtenderType ext)
664 if (0x3400 <= i && i <= 0x4DB5) {
665 int diff = i - 0x3400;
666 buf.AppendCJKExtension (
667 (byte) (0x10 + diff / 254),
668 (byte) (diff % 254 + 2));
672 UnicodeCategory uc = char.GetUnicodeCategory ((char) i);
674 case UnicodeCategory.PrivateUse:
675 int diff = i - 0xE000;
677 (byte) (0xE5 + diff / 254),
678 (byte) (diff % 254 + 2),
682 case UnicodeCategory.Surrogate:
683 FillSurrogateSortKeyRaw (i);
687 byte level2 = Level2 (i, ext);
688 if (Uni.HasSpecialWeight ((char) i)) {
689 byte level1 = Level1 (i);
695 Uni.IsJapaneseSmallLetter ((char) i),
696 ToDashTypeValue (ext),
697 !Uni.IsHiragana ((char) i),
698 IsHalfKana ((char) i)
700 if (!ignoreNonSpace && ext == ExtenderType.Voiced)
701 // Append voice weight
702 buf.AppendNormal (1, 1, 1, 0);
712 void FillSurrogateSortKeyRaw (int i)
721 lower = (byte) ((i == 0xD800) ? 0x3E : 0x3F);
722 } else if (0xD840 <= i && i < 0xD880) {
726 } else if (0xDB80 <= i && i < 0xDC00) {
727 diffbase = 0xDB80 - 0x40;
731 diffbase = 0xDC00 - 0xF8 + 2;
735 int diff = i - diffbase;
738 (byte) (segment + diff / 254),
739 (byte) (diff % 254 + 2),
748 public int Compare (string s1, string s2)
750 return Compare (s1, s2, CompareOptions.None);
753 public int Compare (string s1, string s2, CompareOptions options)
755 return Compare (s1, 0, s1.Length, s2, 0, s2.Length, options);
760 public string Source;
766 // Those instances are reused not to invoke instantiation
768 Escape escape1 = new Escape ();
769 Escape escape2 = new Escape ();
771 private int CompareOrdinal (string s1, int idx1, int len1,
772 string s2, int idx2, int len2)
774 int min = len1 < len2 ? len1 : len2;
775 int end1 = idx1 + min;
776 int end2 = idx2 + min;
777 for (int i1 = idx1, i2 = idx2;
778 i1 < end1 && i2 < end2; i1++, i2++)
779 if (s1 [i1] != s2 [i2])
780 return s1 [i1] - s2 [i2];
781 return len1 == len2 ? 0 :
782 len1 == min ? - 1 : 1;
785 public int Compare (string s1, int idx1, int len1,
786 string s2, int idx2, int len2, CompareOptions options)
788 // quick equality check
789 if (idx1 == idx2 && len1 == len2 &&
790 Object.ReferenceEquals (s1, s2))
792 // FIXME: this should be done inside Compare() at
794 // int ord = CompareOrdinal (s1, idx1, len1, s2, idx2, len2);
797 if (options == CompareOptions.Ordinal)
798 return CompareOrdinal (s1, idx1, len1, s2, idx2, len2);
800 #if false // stable easy version, depends on GetSortKey().
801 SortKey sk1 = GetSortKey (s1, idx1, len1, options);
802 SortKey sk2 = GetSortKey (s2, idx2, len2, options);
803 byte [] d1 = sk1.KeyData;
804 byte [] d2 = sk2.KeyData;
805 int len = d1.Length > d2.Length ? d2.Length : d1.Length;
806 for (int i = 0; i < len; i++)
807 if (d1 [i] != d2 [i])
808 return d1 [i] < d2 [i] ? -1 : 1;
809 return d1.Length == d2.Length ? 0 : d1.Length < d2.Length ? -1 : 1;
811 SetOptions (options);
813 int ret = CompareInternal (s1, idx1, len1, s2, idx2, len2, (options & CompareOptions.StringSort) != 0, out dummy, out dummy2, true);
814 return ret == 0 ? 0 : ret < 0 ? -1 : 1;
818 int CompareInternal (string s1, int idx1, int len1, string s2,
819 int idx2, int len2, bool stringSort,
820 out bool targetConsumed, out bool sourceConsumed,
821 bool skipHeadingExtenders)
825 int end1 = idx1 + len1;
826 int end2 = idx2 + len2;
827 targetConsumed = false;
828 sourceConsumed = false;
830 // It holds final result that comes from the comparison
831 // at level 2 or lower. Even if Compare() found the
832 // difference at level 2 or lower, it still has to
833 // continue level 1 comparison. FinalResult is used
834 // when there was no further differences.
836 // It holds the comparison level to do. It starts from
837 // 5, and becomes 1 when we do primary-only comparison.
838 int currentLevel = 5;
845 // Skip heading extenders
846 if (skipHeadingExtenders) {
847 for (; idx1 < end1; idx1++)
848 if (GetExtenderType (s1 [idx1]) == ExtenderType.None)
850 for (; idx2 < end2; idx2++)
851 if (GetExtenderType (s2 [idx2]) == ExtenderType.None)
855 ExtenderType ext1 = ExtenderType.None;
856 ExtenderType ext2 = ExtenderType.None;
859 for (; idx1 < end1; idx1++)
860 if (!IsIgnorable (s1 [idx1]))
862 for (; idx2 < end2; idx2++)
863 if (!IsIgnorable (s2 [idx2]))
867 if (escape1.Source == null)
870 start1 = escape1.Start;
871 idx1 = escape1.Index;
873 escape1.Source = null;
877 if (escape2.Source == null)
880 start2 = escape2.Start;
881 idx2 = escape2.Index;
883 escape2.Source = null;
887 // FIXME: optimization could be done here.
889 if (s1 [idx1] == s2 [idx2]) {
894 // while (idx1 >= start1 && !IsSafe ((int) s [idx1]))
896 // while (idx2 >= start2 && !IsSafe ((int) s [idx2]))
904 int i1 = FilterOptions (s1 [idx1]);
905 int i2 = FilterOptions (s2 [idx2]);
906 bool special1 = false;
907 bool special2 = false;
909 // If current character is an expander, then
910 // repeat the previous character.
911 ext1 = GetExtenderType (i1);
912 if (ext1 != ExtenderType.None) {
913 if (previousChar < 0) {
914 if (previousSortKey == null) {
919 sk1 = previousSortKey;
922 i1 = FilterExtender (previousChar, ext1);
924 ext2 = GetExtenderType (i2);
925 if (ext2 != ExtenderType.None) {
926 if (previousChar2 < 0) {
927 if (previousSortKey2 == null) {
932 sk2 = previousSortKey2;
935 i2 = FilterExtender (previousChar2, ext2);
938 byte cat1 = Category (i1);
939 byte cat2 = Category (i2);
941 // Handle special weight characters
942 if (!stringSort && currentLevel > 4) {
944 lv5At1 = escape1.Source != null ?
945 escape1.Index - escape1.Start :
947 // here Windows has a bug that it does
948 // not consider thirtiary weight.
949 lv5Value1 = Level1 (i1) << 8 + Uni.Level3 (i1);
954 lv5At2 = escape2.Source != null ?
955 escape2.Index - escape2.Start :
957 // here Windows has a bug that it does
958 // not consider thirtiary weight.
959 lv5Value2 = Level1 (i2) << 8 + Uni.Level3 (i2);
963 if (cat1 == 6 || cat2 == 6) {
969 Contraction ct1 = null;
970 if (ext1 == ExtenderType.None)
971 ct1 = GetContraction (s1, idx1, end1);
976 else if (ct1 != null) {
977 offset1 = ct1.Source.Length;
978 if (ct1.SortKey != null) {
980 for (int i = 0; i < ct1.SortKey.Length; i++)
981 sk1 [i] = ct1.SortKey [i];
983 previousSortKey = sk1;
985 else if (escape1.Source == null) {
987 escape1.Start = start1;
988 escape1.Index = cur1 + ct1.Source.Length;
990 s1 = ct1.Replacement;
1000 sk1 [1] = Level1 (i1);
1001 if (!ignoreNonSpace && currentLevel > 1)
1002 sk1 [2] = Level2 (i1, ext1);
1003 if (currentLevel > 2)
1004 sk1 [3] = Uni.Level3 (i1);
1005 if (currentLevel > 3)
1006 special1 = Uni.HasSpecialWeight ((char) i1);
1011 Contraction ct2 = null;
1012 if (ext2 == ExtenderType.None)
1013 ct2 = GetContraction (s2, idx2, end2);
1017 else if (ct2 != null) {
1018 idx2 += ct2.Source.Length;
1019 if (ct2.SortKey != null) {
1021 for (int i = 0; i < ct2.SortKey.Length; i++)
1022 sk2 [i] = ct2.SortKey [i];
1024 previousSortKey2 = sk2;
1026 else if (escape2.Source == null) {
1027 escape2.Source = s2;
1028 escape2.Start = start2;
1029 escape2.Index = cur2 + ct2.Source.Length;
1031 s2 = ct2.Replacement;
1041 sk2 [1] = Level1 (i2);
1042 if (!ignoreNonSpace && currentLevel > 1)
1043 sk2 [2] = Level2 (i2, ext2);
1044 if (currentLevel > 2)
1045 sk2 [3] = Uni.Level3 (i2);
1046 if (currentLevel > 3)
1047 special2 = Uni.HasSpecialWeight ((char) i2);
1053 // Add offset here so that it does not skip
1054 // idx1 while s2 has a replacement.
1057 // add diacritical marks in s1 here
1058 if (!ignoreNonSpace) {
1059 while (idx1 < end1) {
1060 if (Category (s1 [idx1]) != 1)
1064 sk1 [2] = (byte) (sk1 [2] +
1065 Level2 (s1 [idx1], ExtenderType.None));
1069 // add diacritical marks in s2 here
1070 while (idx2 < end2) {
1071 if (Category (s2 [idx2]) != 1)
1075 sk2 [2] = (byte) (sk2 [2] +
1076 Level2 (s2 [idx2], ExtenderType.None));
1081 int ret = sk1 [0] - sk2 [0];
1082 ret = ret != 0 ? ret : sk1 [1] - sk2 [1];
1085 if (currentLevel == 1)
1087 if (!ignoreNonSpace) {
1088 ret = sk1 [2] - sk2 [2];
1091 currentLevel = frenchSort ? 2 : 1;
1095 if (currentLevel == 2)
1097 ret = sk1 [3] - sk2 [3];
1103 if (currentLevel == 3)
1105 if (special1 != special2) {
1106 finalResult = special1 ? 1 : -1;
1111 ret = CompareFlagPair (
1112 !Uni.IsJapaneseSmallLetter ((char) i1),
1113 !Uni.IsJapaneseSmallLetter ((char) i2));
1114 ret = ret != 0 ? ret :
1115 ToDashTypeValue (ext1) -
1116 ToDashTypeValue (ext2);
1117 ret = ret != 0 ? ret : CompareFlagPair (
1118 Uni.IsHiragana ((char) i1),
1119 Uni.IsHiragana ((char) i2));
1120 ret = ret != 0 ? ret : CompareFlagPair (
1121 !IsHalfKana ((char) i1),
1122 !IsHalfKana ((char) i2));
1131 // If there were only level 3 or lower differences,
1132 // then we still have to find diacritical differences
1134 if (!ignoreNonSpace &&
1135 finalResult != 0 && currentLevel > 2) {
1136 while (idx1 < end1 && idx2 < end2) {
1137 if (!Uni.IsIgnorableNonSpacing (s1 [idx1]))
1139 if (!Uni.IsIgnorableNonSpacing (s2 [idx2]))
1141 finalResult = Level2 (FilterOptions ((s1 [idx1])), ext1) - Level2 (FilterOptions (s2 [idx2]), ext2);
1142 if (finalResult != 0)
1146 // they should work only for the first character
1147 ext1 = ExtenderType.None;
1148 ext2 = ExtenderType.None;
1151 // we still have to handle level 5
1152 if (finalResult == 0) {
1153 finalResult = lv5At1 - lv5At2;
1154 if (finalResult == 0)
1155 finalResult = lv5Value1 - lv5Value2;
1157 if (finalResult == 0) {
1159 targetConsumed = true;
1161 sourceConsumed = true;
1163 return idx1 != end1 ? 1 : idx2 == end2 ? finalResult : -1;
1166 int CompareFlagPair (bool b1, bool b2)
1168 return b1 == b2 ? 0 : b1 ? 1 : -1;
1173 #region IsPrefix() and IsSuffix()
1175 public bool IsPrefix (string src, string target, CompareOptions opt)
1177 return IsPrefix (src, target, 0, src.Length, opt);
1180 public bool IsPrefix (string s, string target, int start, int length, CompareOptions opt)
1183 return IsPrefix (s, target, start, length,
1184 (opt & CompareOptions.StringSort) != 0, true);
1187 public bool IsPrefix (string s, string target, int start, int length, bool stringSort, bool skipHeadingExtenders)
1189 bool consumed, dummy;
1190 int ret = CompareInternal (s, start, length,
1191 target, 0, target.Length, stringSort,
1192 out consumed, out dummy, skipHeadingExtenders);
1198 public bool IsSuffix (string src, string target, CompareOptions opt)
1200 return IsSuffix (src, target, src.Length - 1, src.Length, opt);
1203 public bool IsSuffix (string s, string target, int start, int length, CompareOptions opt)
1205 // quick check : simple codepoint comparison
1206 if (s.Length >= target.Length) {
1208 int se = start - length;
1209 for (int i = target.Length - 1; si >= se && i >= 0; i--, si--)
1210 if (s [si] != target [i])
1212 if (si == start + target.Length)
1217 return IsSuffix (s, target, start, length,
1218 (opt & CompareOptions.StringSort) != 0);
1221 bool IsSuffix (string s, string t, int start, int length,
1225 for (;tstart < t.Length; tstart++)
1226 if (!IsIgnorable (t [tstart]))
1228 if (tstart == t.Length)
1229 return true; // as if target is String.Empty.
1231 // FIXME: it is not efficient.
1232 bool sourceConsumed, targetConsumed;
1233 for (int i = 0; i < length; i++) {
1234 escape1.Source = escape2.Source = null;
1235 previousSortKey = previousSortKey2 = null;
1236 previousChar = previousChar2 = -1;
1238 int ret = CompareInternal (s, start - i, i + 1,
1239 t, tstart, t.Length - tstart,
1240 stringSort, out targetConsumed,
1241 out sourceConsumed, true);
1244 if (!sourceConsumed && targetConsumed)
1252 #region IndexOf() / LastIndexOf()
1254 public int IndexOf (string s, string target, CompareOptions opt)
1256 return IndexOf (s, target, 0, s.Length, opt);
1259 public int IndexOf (string s, string target, int start, int length, CompareOptions opt)
1262 return IndexOf (s, target, start, length,
1263 (opt & CompareOptions.StringSort) != 0);
1266 public int IndexOf (string s, char target, CompareOptions opt)
1268 return IndexOf (s, target, 0, s.Length, opt);
1271 public int IndexOf (string s, char target, int start, int length, CompareOptions opt)
1275 // If target is contraction, then use string search.
1276 Contraction ct = GetContraction (target);
1278 if (ct.Replacement != null)
1279 return IndexOf (s, ct.Replacement, start, length,
1280 (opt & CompareOptions.StringSort) != 0);
1282 return IndexOfSortKey (s, start, length, ct.SortKey, char.MinValue, -1, true);
1284 int ti = FilterOptions ((int) target);
1285 charSortKeyIndexTarget [0] = Category (ti);
1286 charSortKeyIndexTarget [1] = Level1 (ti);
1287 if (!ignoreNonSpace)
1288 charSortKeyIndexTarget [2] =
1289 Level2 (ti, ExtenderType.None);
1290 charSortKeyIndexTarget [3] = Uni.Level3 (ti);
1291 return IndexOfSortKey (s, start, length,
1292 charSortKeyIndexTarget, target, ti,
1293 !Uni.HasSpecialWeight ((char) ti));
1297 // Searches target byte[] keydata
1298 int IndexOfSortKey (string s, int start, int length, byte [] sortkey, char target, int ti, bool noLv4)
1300 int end = start + length;
1304 if (MatchesForward (s, ref idx, end, ti, sortkey, noLv4))
1310 // Searches string. Search head character (or keydata when
1311 // the head is contraction sortkey) and try IsPrefix().
1312 int IndexOf (string s, string target, int start, int length, bool stringSort)
1315 for (; tidx < target.Length; tidx++)
1316 if (!IsIgnorable (target [tidx]))
1318 if (tidx == target.Length)
1320 Contraction ct = GetContraction (target, tidx, target.Length - tidx);
1321 string replace = ct != null ? ct.Replacement : null;
1322 byte [] sk = replace == null ? charSortKeyIndexTarget : null;
1324 char tc = char.MinValue;
1326 if (ct != null && sk != null) {
1327 for (int i = 0; i < ct.SortKey.Length; i++)
1328 sk [i] = ct.SortKey [i];
1329 } else if (sk != null) {
1331 ti = FilterOptions (target [tidx]);
1332 sk [0] = Category (ti);
1333 sk [1] = Level1 (ti);
1334 if (!ignoreNonSpace)
1335 sk [2] = Level2 (ti, ExtenderType.None);
1336 sk [3] = Uni.Level3 (ti);
1337 noLv4 = !Uni.HasSpecialWeight ((char) ti);
1340 for (tidx++; tidx < target.Length; tidx++) {
1341 if (Category (target [tidx]) != 1)
1345 sk [2] = (byte) (sk [2] + Level2 (target [tidx], ExtenderType.None));
1351 if (replace != null)
1352 idx = IndexOf (s, replace, start, length, stringSort);
1354 idx = IndexOfSortKey (s, start, length, sk, tc, ti, noLv4);
1357 length -= idx - start;
1359 if (IsPrefix (s, target, start, length, stringSort, false))
1361 Contraction cts = GetContraction (s, start, length);
1363 start += cts.Source.Length;
1364 length -= cts.Source.Length;
1369 } while (length > 0);
1374 // There are the same number of IndexOf() related methods,
1375 // with the same functionalities for each.
1378 public int LastIndexOf (string s, string target, CompareOptions opt)
1380 return LastIndexOf (s, target, s.Length - 1, s.Length, opt);
1383 public int LastIndexOf (string s, string target, int start, int length, CompareOptions opt)
1386 return LastIndexOf (s, target, start, length,
1387 (opt & CompareOptions.StringSort) != 0);
1390 public int LastIndexOf (string s, char target, CompareOptions opt)
1392 return LastIndexOf (s, target, s.Length - 1, s.Length, opt);
1395 public int LastIndexOf (string s, char target, int start, int length, CompareOptions opt)
1399 // If target is contraction, then use string search.
1400 Contraction ct = GetContraction (target);
1402 if (ct.Replacement != null)
1403 return LastIndexOf (s, ct.Replacement, start, length, (opt & CompareOptions.StringSort) != 0);
1405 return LastIndexOfSortKey (s, start, start, length, ct.SortKey, char.MinValue, -1, true);
1408 int ti = FilterOptions ((int) target);
1409 charSortKeyIndexTarget [0] = Category (ti);
1410 charSortKeyIndexTarget [1] = Level1 (ti);
1411 if (!ignoreNonSpace)
1412 charSortKeyIndexTarget [2] = Level2 (ti, ExtenderType.None);
1413 charSortKeyIndexTarget [3] = Uni.Level3 (ti);
1414 return LastIndexOfSortKey (s, start, start, length, charSortKeyIndexTarget, target, ti, !Uni.HasSpecialWeight ((char) ti));
1418 // Searches target byte[] keydata
1419 int LastIndexOfSortKey (string s, int start, int orgStart, int length, byte [] sortkey, char target, int ti, bool noLv4)
1421 int end = start - length;
1425 if (MatchesBackward (s, ref idx, end, orgStart, ti, sortkey, noLv4))
1431 // Searches string. Search head character (or keydata when
1432 // the head is contraction sortkey) and try IsPrefix().
1433 int LastIndexOf (string s, string target, int start, int length, bool stringSort)
1435 int orgStart = start;
1437 for (; tidx < target.Length; tidx++)
1438 if (!IsIgnorable (target [tidx]))
1440 if (tidx == target.Length)
1442 Contraction ct = GetContraction (target, tidx, target.Length - tidx);
1443 string replace = ct != null ? ct.Replacement : null;
1444 byte [] sk = replace == null ? charSortKeyIndexTarget : null;
1447 char tc = char.MinValue;
1449 if (ct != null && sk != null) {
1450 for (int i = 0; i < ct.SortKey.Length; i++)
1451 sk [i] = ct.SortKey [i];
1452 } else if (sk != null) {
1454 ti = FilterOptions (target [tidx]);
1455 sk [0] = Category (ti);
1456 sk [1] = Level1 (ti);
1457 if (!ignoreNonSpace)
1458 sk [2] = Level2 (ti, ExtenderType.None);
1459 sk [3] = Uni.Level3 (ti);
1460 noLv4 = !Uni.HasSpecialWeight ((char) ti);
1463 for (tidx++; tidx < target.Length; tidx++) {
1464 if (Category (target [tidx]) != 1)
1468 sk [2] = (byte) (sk [2] + Level2 (target [tidx], ExtenderType.None));
1475 if (replace != null)
1476 idx = LastIndexOf (s, replace, start, length, stringSort);
1478 idx = LastIndexOfSortKey (s, start, orgStart, length, sk, tc, ti, noLv4);
1481 length -= start - idx;
1484 if (IsPrefix (s, target, idx, orgStart - idx + 1, stringSort, false)) {
1485 for (;idx < orgStart; idx++)
1486 if (!IsIgnorable (s [idx]))
1490 Contraction cts = GetContraction (s, idx, orgStart - idx + 1);
1492 start -= cts.Source.Length;
1493 length -= cts.Source.Length;
1498 } while (length > 0);
1502 private bool MatchesForward (string s, ref int idx, int end, int ti, byte [] sortkey, bool noLv4)
1505 ExtenderType ext = GetExtenderType (s [idx]);
1506 Contraction ct = null;
1507 if (ext == ExtenderType.None)
1508 ct = GetContraction (s, idx, end);
1509 else if (previousChar < 0) {
1510 if (previousSortKey == null) {
1514 charSortKey = previousSortKey;
1517 si = FilterExtender (previousChar, ext);
1518 // if lv4 exists, it never matches contraction
1520 idx += ct.Source.Length;
1523 if (ct.SortKey != null) {
1524 for (int i = 0; i < sortkey.Length; i++)
1525 charSortKey [i] = sortkey [i];
1527 previousSortKey = charSortKey;
1529 // Here is the core of LAMESPEC
1530 // described at the top of the source.
1532 return MatchesForward (ct.Replacement, ref dummy,
1533 ct.Replacement.Length, ti, sortkey, noLv4);
1537 si = FilterOptions (s [idx]);
1538 charSortKey [0] = Category (si);
1539 charSortKey [1] = Level1 (si);
1540 if (!ignoreNonSpace)
1541 charSortKey [2] = Level2 (si, ext);
1542 charSortKey [3] = Uni.Level3 (si);
1543 if (charSortKey [0] != 1)
1547 for (; idx < end; idx++) {
1548 if (Category (s [idx]) != 1)
1552 if (charSortKey [2] == 0)
1553 charSortKey [2] = 2;
1554 charSortKey [2] = (byte) (charSortKey [2]
1555 + Level2 (s [idx], ExtenderType.None));
1558 return MatchesPrimitive (charSortKey, si, ext, sortkey, ti, noLv4);
1561 private bool MatchesPrimitive (byte [] charSortKey, int si, ExtenderType ext, byte [] sortkey, int ti, bool noLv4)
1563 if (charSortKey [0] != sortkey [0] ||
1564 charSortKey [1] != sortkey [1] ||
1565 (!ignoreNonSpace && charSortKey [2] != sortkey [2]) ||
1566 charSortKey [3] != sortkey [3])
1568 if (noLv4 && (si < 0 || !Uni.HasSpecialWeight ((char) si)))
1572 // Since target can never be an extender, if the source
1573 // is an expander and it matters, then they never match.
1574 if (!ignoreNonSpace && ext == ExtenderType.Conditional)
1576 if (Uni.IsJapaneseSmallLetter ((char) si) !=
1577 Uni.IsJapaneseSmallLetter ((char) ti) ||
1578 ToDashTypeValue (ext) !=
1579 // FIXME: we will have to specify correct value for target
1580 ToDashTypeValue (ExtenderType.None) ||
1581 !Uni.IsHiragana ((char) si) !=
1582 !Uni.IsHiragana ((char) ti) ||
1583 IsHalfKana ((char) si) !=
1584 IsHalfKana ((char) ti))
1589 private bool MatchesBackward (string s, ref int idx, int end, int orgStart, int ti, byte [] sortkey, bool noLv4)
1593 ExtenderType ext = GetExtenderType (s [idx]);
1594 // To handle extenders in source, we need to
1595 // check next _primary_ character.
1596 if (ext != ExtenderType.None) {
1597 byte diacritical = 0;
1598 for (int tmp = 0; ; tmp--) {
1599 if (tmp < 0) // heading extender
1601 if (IsIgnorable (s [tmp]))
1603 int tmpi = FilterOptions (s [tmp]);
1604 byte category = Category (tmpi);
1605 if (category == 1) {
1606 diacritical = Level2 (tmpi, ExtenderType.None);
1609 si = FilterExtender (tmpi, ext);
1610 charSortKey [0] = category;
1611 charSortKey [1] = Level1 (si);
1612 if (!ignoreNonSpace)
1613 charSortKey [2] = Level2 (si, ext);
1614 charSortKey [3] = Uni.Level3 (si);
1615 if (ext != ExtenderType.Conditional &&
1618 (charSortKey [2] == 0) ?
1619 (byte) (diacritical + 2) :
1625 Contraction ct = null;
1626 if (ext == ExtenderType.None)
1627 ct = GetContraction (s, idx, end);
1628 // if lv4 exists, it never matches contraction
1630 idx -= ct.Source.Length;
1633 if (ct.SortKey != null) {
1634 for (int i = 0; i < sortkey.Length; i++)
1635 charSortKey [i] = sortkey [i];
1637 previousSortKey = charSortKey;
1639 // Here is the core of LAMESPEC
1640 // described at the top of the source.
1641 int dummy = ct.Replacement.Length - 1;
1642 return MatchesBackward (ct.Replacement,
1643 ref dummy, dummy, -1, ti, sortkey, noLv4);
1645 } else if (ext == ExtenderType.None) {
1647 si = FilterOptions (s [idx]);
1648 charSortKey [0] = Category (si);
1649 charSortKey [1] = Level1 (si);
1650 if (!ignoreNonSpace)
1651 charSortKey [2] = Level2 (si, ext);
1652 charSortKey [3] = Uni.Level3 (si);
1653 if (charSortKey [0] != 1)
1657 if (ext == ExtenderType.None) {
1658 for (int tmp = cur + 1; tmp < orgStart; tmp++) {
1659 if (Category (s [tmp]) != 1)
1663 if (charSortKey [2] == 0)
1664 charSortKey [2] = 2;
1665 charSortKey [2] = (byte) (charSortKey [2]
1666 + Level2 (s [tmp], ExtenderType.None));
1669 return MatchesPrimitive (charSortKey, si, ext, sortkey, ti, noLv4);