4 // This class will demonstrate CompareInfo functionality that will just work.
8 // Here's a summary for supporting contractions, expansions and diacritical
11 // Diacritical mapping is a simple tailoring rule that "remaps" diacritical
12 // weight value from one to another. For now it applies to all range of
13 // characters, but at some stage we might need to limit the remapping ranges.
15 // A Contraction consists of a string (actually char[]) and a sortkey entry
16 // (i.e. byte[]). It indicates that a sequence of characters is interpreted
17 // as a single character that has the mapped sortkey value. There is no
18 // character which goes across "special rules". When the collator encountered
19 // such a sequence of characters, it just returns the sortkey value without
20 // further replacement.
22 // Since it is still likely to happen that a contraction sequence matches
23 // other character than the identical sequence (depending on CompareOptions
24 // and of course, defined sortkey value itself), comparison cannot be done
25 // at source char[] level.
31 // In IndexOf(), the first character in the target string or the target char
32 // itself is turned into sortkey bytes. If the character has a contraction and
33 // that is sortkey map, then it is used instead. If the contraction exists and
34 // that is replacement map, then the first character of the replacement string
35 // is searched instead. IndexOf() always searches only for the top character,
36 // and if it returned negative value, then it returns -1. Otherwise, it then
37 // tries IsPrefix() from that location. If it returns true, then it returns
41 // LAMESPEC: IndexOf() is lame as a whole API. It never matches in the middle
42 // of expansion and there is no proper way to return such indexes within
43 // a single int return value.
45 // For example, try below in .NET:
46 // IndexOf("\u00E6", "a")
47 // IndexOf("\u00E6", "e")
52 using System.Collections;
53 using System.Globalization;
55 using Uni = Mono.Globalization.Unicode.MSCompatUnicodeTable;
56 using UUtil = Mono.Globalization.Unicode.MSCompatUnicodeTableUtil;
58 namespace Mono.Globalization.Unicode
60 internal class SimpleCollator
62 static SimpleCollator invariant =
63 new SimpleCollator (CultureInfo.InvariantCulture);
65 internal static readonly byte [] ignorableFlags =
67 internal static readonly byte [] categories =
69 internal static readonly byte [] level1 =
71 internal static readonly byte [] level2 =
73 internal static readonly byte [] level3 =
75 internal static readonly ushort [] widthCompat =
77 internal static readonly CodePointIndexer categoryIndexer =
79 internal static readonly CodePointIndexer lv1Indexer =
81 internal static readonly CodePointIndexer lv2Indexer =
83 internal static readonly CodePointIndexer lv3Indexer =
85 internal static readonly CodePointIndexer widthIndexer =
90 // CompareOptions expanded.
91 bool ignoreNonSpace; // used in IndexOf()
96 TextInfo textInfo; // for ToLower().
98 readonly ushort [] cjkTable;
99 readonly CodePointIndexer cjkIndexer;
100 readonly byte [] cjkLv2Table;
101 readonly CodePointIndexer cjkLv2Indexer;
103 readonly Contraction [] contractions;
104 readonly Level2Map [] level2Maps;
106 // temporary sortkey buffer for index search/comparison
107 byte [] charSortKey = new byte [4];
108 byte [] charSortKey2 = new byte [4];
109 // temporary expansion store for IsPrefix/Suffix
110 int escapedSourceIndex;
112 #region Tailoring support classes
113 // Possible mapping types are:
115 // - string to string (ReplacementMap)
116 // - string to SortKey (SortKeyMap)
117 // - diacritical byte to byte (DiacriticalMap)
119 // There could be mapping from string to sortkeys, but
120 // for now there is none as such.
122 internal class Contraction
124 public readonly char [] Source;
125 // only either of them is used.
126 public readonly string Replacement;
127 public readonly byte [] SortKey;
129 public Contraction (char [] source,
130 string replacement, byte [] sortkey)
133 Replacement = replacement;
138 internal class ContractionComparer : IComparer
140 public static readonly ContractionComparer Instance =
141 new ContractionComparer ();
143 public int Compare (object o1, object o2)
145 Contraction c1 = (Contraction) o1;
146 Contraction c2 = (Contraction) o2;
147 char [] a1 = c1.Source;
148 char [] a2 = c2.Source;
149 int min = a1.Length > a2.Length ?
150 a2.Length : a1.Length;
151 for (int i = 0; i < min; i++)
152 if (a1 [i] != a2 [i])
153 return a1 [i] - a2 [i];
154 return a1.Length - a2.Length;
158 internal class Level2Map
163 public Level2Map (byte source, byte replace)
170 internal class Level2MapComparer : IComparer
172 public static readonly Level2MapComparer Instance =
173 new Level2MapComparer ();
175 public int Compare (object o1, object o2)
177 Level2Map m1 = (Level2Map) o1;
178 Level2Map m2 = (Level2Map) o2;
179 return (m1.Source - m2.Source);
185 #region .ctor() and split functions
187 public SimpleCollator (CultureInfo culture)
190 textInfo = culture.TextInfo;
191 buf = new SortKeyBuffer (culture.LCID);
193 SetCJKTable (culture, ref cjkTable, ref cjkIndexer,
194 ref cjkLv2Table, ref cjkLv2Indexer);
196 // Get tailoring info
197 TailoringInfo t = null;
198 for (CultureInfo ci = culture; ci.LCID != 127; ci = ci.Parent) {
199 t = Uni.GetTailoringInfo (ci.LCID);
203 if (t == null) // then use invariant
204 t = Uni.GetTailoringInfo (127);
206 frenchSort = t.FrenchSort;
207 BuildTailoringTables (culture, t, ref contractions,
209 // FIXME: Since tailorings are mostly for latin
210 // (and in some cases Cyrillic) characters, it would
211 // be much better for performance to store "start
212 // indexes" for > 370 (culture-specific letters).
215 // dump tailoring table
216 Console.WriteLine ("******** building table for {0} : c - {1} d - {2}",
217 culture.LCID, contractions.Length, level2Maps.Length);
218 foreach (Contraction c in contractions) {
219 foreach (char cc in c.Source)
220 Console.Write ("{0:X4} ", (int) cc);
221 Console.WriteLine (" -> '{0}'", c.Replacement);
226 private void BuildTailoringTables (CultureInfo culture,
228 ref Contraction [] contractions,
229 ref Level2Map [] diacriticals)
231 // collect tailoring entries.
232 ArrayList cmaps = new ArrayList ();
233 ArrayList dmaps = new ArrayList ();
234 char [] tarr = Uni.TailoringValues;
235 int idx = t.TailoringIndex;
236 int end = idx + t.TailoringCount;
240 switch (tarr [idx]) {
241 case '\x1': // SortKeyMap
243 while (tarr [ss] != 0)
245 src = new char [ss - idx];
246 Array.Copy (tarr, idx, src, 0, ss - idx);
247 byte [] sortkey = new byte [4];
248 for (int i = 0; i < 4; i++)
249 sortkey [i] = (byte) tarr [ss + 1 + i];
250 cmaps.Add (new Contraction (
251 src, null, sortkey));
255 case '\x2': // DiacriticalMap
256 dmaps.Add (new Level2Map (
257 (byte) tarr [idx + 1],
258 (byte) tarr [idx + 2]));
261 case '\x3': // ReplacementMap
263 while (tarr [ss] != 0)
265 src = new char [ss - idx];
266 Array.Copy (tarr, idx, src, 0, ss - idx);
269 while (tarr [l] != 0)
271 string r = new string (tarr, ss, l - ss);
272 cmaps.Add (new Contraction (
277 throw new NotImplementedException (String.Format ("Mono INTERNAL ERROR (Should not happen): Collation tailoring table is broken for culture {0} ({1}) at 0x{2:X}", culture.LCID, culture.Name, idx));
280 cmaps.Sort (ContractionComparer.Instance);
281 dmaps.Sort (Level2MapComparer.Instance);
282 contractions = cmaps.ToArray (typeof (Contraction))
284 diacriticals = dmaps.ToArray (typeof (Level2Map))
288 private void SetCJKTable (CultureInfo culture,
289 ref ushort [] cjkTable, ref CodePointIndexer cjkIndexer,
290 ref byte [] cjkLv2Table, ref CodePointIndexer cjkLv2Indexer)
292 string name = GetNeutralCulture (culture).Name;
296 // custom CJK table support.
299 cjkTable = Uni.CjkCHS;
300 cjkIndexer = UUtil.CjkCHS;
303 cjkTable = Uni.CjkCHT;
304 cjkIndexer = UUtil.Cjk;
307 cjkTable = Uni.CjkJA;
308 cjkIndexer = UUtil.Cjk;
311 cjkTable = Uni.CjkKO;
312 cjkLv2Table = Uni.CjkKOLv2;
313 cjkIndexer = UUtil.Cjk;
314 cjkLv2Indexer = UUtil.Cjk;
319 static CultureInfo GetNeutralCulture (CultureInfo info)
321 CultureInfo ret = info;
322 while (ret.Parent != null && ret.Parent.LCID != 127)
329 byte Category (int cp)
331 if (cp < 0x3000 || cjkTable == null)
332 return categories [categoryIndexer.ToIndex (cp)];
333 int idx = cjkIndexer.ToIndex (cp);
334 ushort cjk = idx < 0 ? (ushort) 0 : cjkTable [idx];
335 return cjk != 0 ? (byte) ((cjk & 0xFF00) >> 8) :
336 categories [categoryIndexer.ToIndex (cp)];
341 if (cp < 0x3000 || cjkTable == null)
342 return level1 [lv1Indexer.ToIndex (cp)];
343 int idx = cjkIndexer.ToIndex (cp);
344 ushort cjk = idx < 0 ? (ushort) 0 : cjkTable [idx];
345 return cjk != 0 ? (byte) (cjk & 0xFF) :
346 level1 [lv1Indexer.ToIndex (cp)];
349 byte Level2 (int cp, ExtenderType ext)
351 if (ext == ExtenderType.Buggy)
353 else if (ext == ExtenderType.Conditional)
356 if (cp < 0x3000 || cjkLv2Table == null)
357 return level2 [lv2Indexer.ToIndex (cp)];
358 int idx = cjkLv2Indexer.ToIndex (cp);
359 byte ret = idx < 0 ? (byte) 0 : cjkLv2Table [idx];
362 ret = level2 [lv2Indexer.ToIndex (cp)];
363 if (level2Maps.Length == 0)
365 for (int i = 0; i < level2Maps.Length; i++) {
366 if (level2Maps [i].Source == ret)
367 return level2Maps [i].Replace;
368 else if (level2Maps [i].Source > ret)
374 bool IsHalfKana (int cp)
376 return ignoreWidth || Uni.IsHalfWidthKana ((char) cp);
379 void SetOptions (CompareOptions options)
381 this.ignoreNonSpace = (options & CompareOptions.IgnoreNonSpace) != 0;
382 this.ignoreSymbols = (options & CompareOptions.IgnoreSymbols) != 0;
383 this.ignoreWidth = (options & CompareOptions.IgnoreWidth) != 0;
384 this.ignoreCase = (options & CompareOptions.IgnoreCase) != 0;
385 this.ignoreKanaType = (options & CompareOptions.IgnoreKanaType) != 0;
388 Contraction GetContraction (string s, int start, int end)
390 Contraction c = GetContraction (s, start, end, contractions);
391 if (c != null || lcid == 127)
393 return GetContraction (s, start, end, invariant.contractions);
396 Contraction GetContraction (string s, int start, int end, Contraction [] clist)
398 for (int i = 0; i < clist.Length; i++) {
399 Contraction ct = clist [i];
400 int diff = ct.Source [0] - s [start];
402 return null; // it's already sorted
405 char [] chars = ct.Source;
406 if (end - start < chars.Length)
409 for (int n = 0; n < chars.Length; n++)
410 if (s [start + n] != chars [n]) {
420 Contraction GetTailContraction (string s, int start, int end)
422 Contraction c = GetTailContraction (s, start, end, contractions);
423 if (c != null || lcid == 127)
425 return GetTailContraction (s, start, end, invariant.contractions);
428 Contraction GetTailContraction (string s, int start, int end, Contraction [] clist)
430 for (int i = 0; i < clist.Length; i++) {
431 Contraction ct = clist [i];
432 int diff = ct.Source [0] - s [end];
434 return null; // it's already sorted
437 char [] chars = ct.Source;
438 if (start - end + 1 < chars.Length)
441 int offset = start - chars.Length + 1;
442 for (int n = 0; n < chars.Length; n++)
443 if (s [offset + n] != chars [n]) {
453 Contraction GetContraction (char c)
455 Contraction ct = GetContraction (c, contractions);
456 if (ct != null || lcid == 127)
458 return GetContraction (c, invariant.contractions);
461 Contraction GetContraction (char c, Contraction [] clist)
463 for (int i = 0; i < clist.Length; i++) {
464 Contraction ct = clist [i];
465 if (ct.Source [0] > c)
466 return null; // it's already sorted
467 if (ct.Source [0] == c && ct.Source.Length == 1)
473 int FilterOptions (int i)
476 int x = widthCompat [widthIndexer.ToIndex (i)];
481 i = textInfo.ToLower ((char) i);
483 i = Uni.ToKanaTypeInsensitive (i);
487 int previousChar = -1;
488 byte [] previousSortKey = null;
489 int previousChar2 = -1;
490 byte [] previousSortKey2 = null;
500 ExtenderType GetExtenderType (int i)
502 // LAMESPEC: Windows expects true for U+3005, but
503 // sometimes it does not represent to repeat just
505 // Windows also expects true for U+3031 and U+3032,
506 // but they should *never* repeat one character.
508 if (i < 0x3005 || i > 0xFF70)
509 return ExtenderType.None;
510 if (i == 0xFE7C || i == 0xFE7D)
511 return ExtenderType.Simple;
513 return ExtenderType.Conditional;
515 return ExtenderType.None;
517 case 0x3005: // LAMESPEC: see above.
518 return ExtenderType.Buggy;
519 case 0x3031: // LAMESPEC: see above.
520 case 0x3032: // LAMESPEC: see above.
523 return ExtenderType.Simple;
526 return ExtenderType.Voiced;
528 return ExtenderType.Conditional;
530 return ExtenderType.None;
534 byte ToDashTypeValue (ExtenderType ext)
536 if (ignoreNonSpace) // LAMESPEC: huh, why?
539 case ExtenderType.None:
541 case ExtenderType.Conditional:
548 int FilterExtender (int i, ExtenderType ext)
550 if (ext == ExtenderType.Conditional &&
551 Uni.HasSpecialWeight ((char) i)) {
552 bool half = IsHalfKana ((char) i);
553 bool katakana = !Uni.IsHiragana ((char) i);
554 switch (Level1 (i) & 7) {
556 return half ? 0xFF71 : katakana ? 0x30A2 : 0x3042;
558 return half ? 0xFF72 : katakana ? 0x30A4 : 0x3044;
560 return half ? 0xFF73 : katakana ? 0x30A6 : 0x3046;
562 return half ? 0xFF74 : katakana ? 0x30A8 : 0x3048;
564 return half ? 0xFF75 : katakana ? 0x30AA : 0x304A;
570 bool IsIgnorable (int i)
572 return Uni.IsIgnorable (i) ||
573 ignoreSymbols && Uni.IsIgnorableSymbol (i) ||
574 ignoreNonSpace && Uni.IsIgnorableNonSpacing (i);
580 public SortKey GetSortKey (string s)
582 return GetSortKey (s, CompareOptions.None);
585 public SortKey GetSortKey (string s, CompareOptions options)
587 return GetSortKey (s, 0, s.Length, options);
590 public SortKey GetSortKey (string s, int start, int length, CompareOptions options)
592 SetOptions (options);
594 buf.Initialize (options, lcid, s, frenchSort);
595 int end = start + length;
597 GetSortKey (s, start, end);
598 return buf.GetResultAndReset ();
601 void GetSortKey (string s, int start, int end)
603 for (int n = start; n < end; n++) {
606 ExtenderType ext = GetExtenderType (i);
607 if (ext != ExtenderType.None) {
608 i = FilterExtender (previousChar, ext);
610 FillSortKeyRaw (i, ext);
611 else if (previousSortKey != null) {
612 byte [] b = previousSortKey;
616 b [2] != 1 ? b [2] : Level2 (i, ext),
617 b [3] != 1 ? b [3] : level3 [lv3Indexer.ToIndex (i)]);
619 // otherwise do nothing.
620 // (if the extender is the first char
621 // in the string, then just ignore.)
627 i = FilterOptions (i);
629 Contraction ct = GetContraction (s, n, end);
631 if (ct.Replacement != null) {
632 GetSortKey (ct.Replacement, 0, ct.Replacement.Length);
634 byte [] b = ct.SortKey;
638 b [2] != 1 ? b [2] : Level2 (i, ext),
639 b [3] != 1 ? b [3] : level3 [lv3Indexer.ToIndex (i)]);
643 n += ct.Source.Length - 1;
646 if (!Uni.IsIgnorableNonSpacing (i))
648 FillSortKeyRaw (i, ExtenderType.None);
653 void FillSortKeyRaw (int i, ExtenderType ext)
655 if (0x3400 <= i && i <= 0x4DB5) {
656 int diff = i - 0x3400;
657 buf.AppendCJKExtension (
658 (byte) (0x10 + diff / 254),
659 (byte) (diff % 254 + 2));
663 UnicodeCategory uc = char.GetUnicodeCategory ((char) i);
665 case UnicodeCategory.PrivateUse:
666 int diff = i - 0xE000;
668 (byte) (0xE5 + diff / 254),
669 (byte) (diff % 254 + 2),
673 case UnicodeCategory.Surrogate:
674 FillSurrogateSortKeyRaw (i);
678 byte level2 = Level2 (i, ext);
679 if (Uni.HasSpecialWeight ((char) i)) {
680 byte level1 = Level1 (i);
686 Uni.IsJapaneseSmallLetter ((char) i),
687 ToDashTypeValue (ext),
688 !Uni.IsHiragana ((char) i),
689 IsHalfKana ((char) i)
691 if (!ignoreNonSpace && ext == ExtenderType.Voiced)
692 // Append voice weight
693 buf.AppendNormal (1, 1, 1, 0);
703 void FillSurrogateSortKeyRaw (int i)
712 lower = (byte) ((i == 0xD800) ? 0x3E : 0x3F);
713 } else if (0xD840 <= i && i < 0xD880) {
717 } else if (0xDB80 <= i && i < 0xDC00) {
718 diffbase = 0xDB80 - 0x40;
722 diffbase = 0xDC00 - 0xF8 + 2;
726 int diff = i - diffbase;
729 (byte) (segment + diff / 254),
730 (byte) (diff % 254 + 2),
739 public int Compare (string s1, string s2)
741 return Compare (s1, s2, CompareOptions.None);
744 public int Compare (string s1, string s2, CompareOptions options)
746 return Compare (s1, 0, s1.Length, s2, 0, s2.Length, options);
751 public string Source;
757 // Those instances are reused not to invoke instantiation
759 Escape escape1 = new Escape ();
760 Escape escape2 = new Escape ();
762 private int CompareOrdinal (string s1, int idx1, int len1,
763 string s2, int idx2, int len2)
765 int min = len1 < len2 ? len1 : len2;
766 int end1 = idx1 + min;
767 int end2 = idx2 + min;
768 for (int i1 = idx1, i2 = idx2;
769 i1 < end1 && i2 < end2; i1++, i2++)
770 if (s1 [i1] != s2 [i2])
771 return s1 [i1] - s2 [i2];
772 return len1 == len2 ? 0 :
773 len1 == min ? - 1 : 1;
776 public int Compare (string s1, int idx1, int len1,
777 string s2, int idx2, int len2, CompareOptions options)
779 // quick equality check
780 if (idx1 == idx2 && len1 == len2 &&
781 Object.ReferenceEquals (s1, s2))
783 // FIXME: this should be done inside Compare() at
785 // int ord = CompareOrdinal (s1, idx1, len1, s2, idx2, len2);
788 if (options == CompareOptions.Ordinal)
789 return CompareOrdinal (s1, idx1, len1, s2, idx2, len2);
791 #if false // stable easy version, depends on GetSortKey().
792 SortKey sk1 = GetSortKey (s1, idx1, len1, options);
793 SortKey sk2 = GetSortKey (s2, idx2, len2, options);
794 byte [] d1 = sk1.KeyData;
795 byte [] d2 = sk2.KeyData;
796 int len = d1.Length > d2.Length ? d2.Length : d1.Length;
797 for (int i = 0; i < len; i++)
798 if (d1 [i] != d2 [i])
799 return d1 [i] < d2 [i] ? -1 : 1;
800 return d1.Length == d2.Length ? 0 : d1.Length < d2.Length ? -1 : 1;
802 SetOptions (options);
803 escape1.Source = null;
804 escape2.Source = null;
805 previousSortKey= previousSortKey2 = null;
806 previousChar = previousChar2 = -1;
808 int ret = CompareInternal (s1, idx1, len1, s2, idx2, len2, (options & CompareOptions.StringSort) != 0, out dummy);
809 return ret == 0 ? 0 : ret < 0 ? -1 : 1;
813 int CompareInternal (string s1, int idx1, int len1, string s2,
814 int idx2, int len2, bool stringSort,
815 out bool targetConsumed)
819 int end1 = idx1 + len1;
820 int end2 = idx2 + len2;
821 targetConsumed = false;
823 // It holds final result that comes from the comparison
824 // at level 2 or lower. Even if Compare() found the
825 // difference at level 2 or lower, it still has to
826 // continue level 1 comparison. FinalResult is used
827 // when there was no further differences.
829 // It holds the comparison level to do. It starts from
830 // 5, and becomes 1 when we do primary-only comparison.
831 int currentLevel = 5;
838 // Skip heading extenders
839 for (; idx1 < end1; idx1++)
840 if (GetExtenderType (s1 [idx1]) == ExtenderType.None)
842 for (; idx2 < end2; idx2++)
843 if (GetExtenderType (s2 [idx2]) == ExtenderType.None)
846 ExtenderType ext1 = ExtenderType.None;
847 ExtenderType ext2 = ExtenderType.None;
850 for (; idx1 < end1; idx1++)
851 if (!IsIgnorable (s1 [idx1]))
853 for (; idx2 < end2; idx2++)
854 if (!IsIgnorable (s2 [idx2]))
858 if (escape1.Source == null)
861 start1 = escape1.Start;
862 idx1 = escape1.Index;
864 escape1.Source = null;
868 if (escape2.Source == null)
871 start2 = escape2.Start;
872 idx2 = escape2.Index;
874 escape2.Source = null;
878 // FIXME: optimization could be done here.
880 if (s1 [idx1] == s2 [idx2]) {
885 // while (idx1 >= start1 && !IsSafe ((int) s [idx1]))
887 // while (idx2 >= start2 && !IsSafe ((int) s [idx2]))
895 int i1 = FilterOptions (s1 [idx1]);
896 int i2 = FilterOptions (s2 [idx2]);
897 bool special1 = false;
898 bool special2 = false;
900 // If current character is an expander, then
901 // repeat the previous character.
902 ext1 = GetExtenderType (i1);
903 if (ext1 != ExtenderType.None) {
904 if (previousChar < 0) {
905 if (previousSortKey == null) {
910 sk1 = previousSortKey;
913 i1 = FilterExtender (previousChar, ext1);
915 ext2 = GetExtenderType (i2);
916 if (ext2 != ExtenderType.None) {
917 if (previousChar2 < 0) {
918 if (previousSortKey2 == null) {
923 sk2 = previousSortKey2;
926 i2 = FilterExtender (previousChar2, ext2);
929 byte cat1 = Category (i1);
930 byte cat2 = Category (i2);
932 // Handle special weight characters
933 if (!stringSort && currentLevel > 4) {
935 lv5At1 = escape1.Source != null ?
936 escape1.Index - escape1.Start :
938 // here Windows has a bug that it does
939 // not consider thirtiary weight.
940 lv5Value1 = Level1 (i1) << 8 + Uni.Level3 (i1);
945 lv5At2 = escape2.Source != null ?
946 escape2.Index - escape2.Start :
948 // here Windows has a bug that it does
949 // not consider thirtiary weight.
950 lv5Value2 = Level1 (i2) << 8 + Uni.Level3 (i2);
954 if (cat1 == 6 || cat2 == 6) {
960 Contraction ct1 = null;
961 if (ext1 == ExtenderType.None)
962 ct1 = GetContraction (s1, idx1, end1);
967 else if (ct1 != null) {
968 offset1 = ct1.Source.Length;
969 if (ct1.SortKey != null) {
971 for (int i = 0; i < ct1.SortKey.Length; i++)
972 sk1 [i] = ct1.SortKey [i];
974 previousSortKey = sk1;
976 else if (escape1.Source == null) {
978 escape1.Start = start1;
979 escape1.Index = cur1 + ct1.Source.Length;
981 s1 = ct1.Replacement;
991 sk1 [1] = Level1 (i1);
992 if (!ignoreNonSpace && currentLevel > 1)
993 sk1 [2] = Level2 (i1, ext1);
994 if (currentLevel > 2)
995 sk1 [3] = Uni.Level3 (i1);
996 if (currentLevel > 3)
997 special1 = Uni.HasSpecialWeight ((char) i1);
1002 Contraction ct2 = null;
1003 if (ext2 == ExtenderType.None)
1004 ct2 = GetContraction (s2, idx2, end2);
1008 else if (ct2 != null) {
1009 idx2 += ct2.Source.Length;
1010 if (ct2.SortKey != null) {
1012 for (int i = 0; i < ct2.SortKey.Length; i++)
1013 sk2 [i] = ct2.SortKey [i];
1015 previousSortKey2 = sk2;
1017 else if (escape2.Source == null) {
1018 escape2.Source = s2;
1019 escape2.Start = start2;
1020 escape2.Index = cur2 + ct2.Source.Length;
1022 s2 = ct2.Replacement;
1032 sk2 [1] = Level1 (i2);
1033 if (!ignoreNonSpace && currentLevel > 1)
1034 sk2 [2] = Level2 (i2, ext2);
1035 if (currentLevel > 2)
1036 sk2 [3] = Uni.Level3 (i2);
1037 if (currentLevel > 3)
1038 special2 = Uni.HasSpecialWeight ((char) i2);
1044 // Add offset here so that it does not skip
1045 // idx1 while s2 has a replacement.
1048 // add diacritical marks in s1 here
1049 if (!ignoreNonSpace) {
1050 while (idx1 < end1) {
1051 if (Category (s1 [idx1]) != 1)
1055 sk1 [2] = (byte) (sk1 [2] +
1056 Level2 (s1 [idx1], ExtenderType.None));
1060 // add diacritical marks in s2 here
1061 while (idx2 < end2) {
1062 if (Category (s2 [idx2]) != 1)
1066 sk2 [2] = (byte) (sk2 [2] +
1067 Level2 (s2 [idx2], ExtenderType.None));
1072 int ret = sk1 [0] - sk2 [0];
1073 ret = ret != 0 ? ret : sk1 [1] - sk2 [1];
1076 if (currentLevel == 1)
1078 if (!ignoreNonSpace) {
1079 ret = sk1 [2] - sk2 [2];
1082 currentLevel = frenchSort ? 2 : 1;
1086 if (currentLevel == 2)
1088 ret = sk1 [3] - sk2 [3];
1094 if (currentLevel == 3)
1096 if (special1 != special2) {
1097 finalResult = special1 ? 1 : -1;
1102 ret = CompareFlagPair (
1103 !Uni.IsJapaneseSmallLetter ((char) i1),
1104 !Uni.IsJapaneseSmallLetter ((char) i2));
1105 ret = ret != 0 ? ret :
1106 ToDashTypeValue (ext1) -
1107 ToDashTypeValue (ext2);
1108 ret = ret != 0 ? ret : CompareFlagPair (
1109 Uni.IsHiragana ((char) i1),
1110 Uni.IsHiragana ((char) i2));
1111 ret = ret != 0 ? ret : CompareFlagPair (
1112 !IsHalfKana ((char) i1),
1113 !IsHalfKana ((char) i2));
1122 // If there were only level 3 or lower differences,
1123 // then we still have to find diacritical differences
1125 if (!ignoreNonSpace &&
1126 finalResult != 0 && currentLevel > 2) {
1127 while (idx1 < end1 && idx2 < end2) {
1128 if (!Uni.IsIgnorableNonSpacing (s1 [idx1]))
1130 if (!Uni.IsIgnorableNonSpacing (s2 [idx2]))
1132 finalResult = Level2 (FilterOptions ((s1 [idx1])), ext1) - Level2 (FilterOptions (s2 [idx2]), ext2);
1133 if (finalResult != 0)
1137 // they should work only for the first character
1138 ext1 = ExtenderType.None;
1139 ext2 = ExtenderType.None;
1142 // we still have to handle level 5
1143 if (finalResult == 0) {
1144 finalResult = lv5At1 - lv5At2;
1145 if (finalResult == 0)
1146 finalResult = lv5Value1 - lv5Value2;
1148 if (finalResult == 0 && idx2 == end2)
1149 targetConsumed = true;
1150 return idx1 != end1 ? 1 : idx2 == end2 ? finalResult : -1;
1155 #region IsPrefix() and IsSuffix()
1157 public bool IsPrefix (string src, string target, CompareOptions opt)
1159 return IsPrefix (src, target, 0, src.Length, opt);
1162 public bool IsPrefix (string s, string target, int start, int length, CompareOptions opt)
1165 return IsPrefix (s, target, start, length,
1166 (opt & CompareOptions.StringSort) != 0);
1169 public bool IsPrefix (string s, string target, int start, int length, bool stringSort)
1172 escape1.Source = null;
1173 escape2.Source = null;
1174 previousSortKey= previousSortKey2 = null;
1175 previousChar = previousChar2 = -1;
1176 int ret = CompareInternal (s, start, length,
1177 target, 0, target.Length, stringSort,
1183 public bool IsPrefix (string s, string target, int start, int length, CompareOptions opt)
1186 return IsPrefix (s, target, start, length);
1189 // returns the consumed length in positive number, or -1 if
1190 // target was not a prefix.
1191 bool IsPrefix (string s, string target, int start, int length)
1193 // quick check : simple codepoint comparison
1194 if (s.Length >= target.Length) {
1196 for (int i = 0; si < length && i < target.Length; i++, si++)
1197 if (s [si] != target [i])
1199 if (si == start + target.Length)
1203 escapedSourceIndex = -1;
1204 return IsPrefixInternal (s, target, start, length);
1207 bool IsPrefixInternal (string s, string target, int start, int length)
1210 int end = start + length;
1214 while (ti < target.Length) {
1215 if (IsIgnorable (target [ti])) {
1223 si = escapedSourceIndex;
1224 end = start + length;
1225 escapedSourceIndex = -1;
1228 if (IsIgnorable (s [si])) {
1233 // Check contraction for target.
1234 Contraction ctt = GetContraction (target, ti, target.Length);
1236 ti += ctt.Source.Length;
1237 if (ctt.SortKey != null) {
1238 int ret = GetMatchLength (ref s, ref si, ref end, -1, ctt.SortKey, true);
1243 string r = ctt.Replacement;
1245 while (i < r.Length && si < end) {
1246 int ret = GetMatchLength (ref s, ref si, ref end, r [i]);
1252 if (i < r.Length && si >= end)
1257 int ret = GetMatchLength (ref s, ref si, ref end, target [ti]);
1265 // All codepoints in the compared range
1266 // matches. In that case, what matters
1267 // is whether the remaining part of
1268 // "target" is ignorable or not.
1269 while (ti < target.Length)
1270 if (!IsIgnorable (target [ti++]))
1278 // WARNING: Don't invoke it outside IsPrefix().
1279 int GetMatchLength (ref string s, ref int idx, ref int end, char target)
1281 int it = FilterOptions ((int) target);
1282 charSortKey [0] = Category (it);
1283 charSortKey [1] = Level1 (it);
1284 if (!ignoreNonSpace)
1285 // FIXME: pass ExtenderType
1286 charSortKey [2] = Level2 (it, ExtenderType.None);
1287 charSortKey [3] = Uni.Level3 (it);
1289 return GetMatchLength (ref s, ref idx, ref end, it, charSortKey, !Uni.HasSpecialWeight ((char) it));
1292 // WARNING: Don't invoke it outside IsPrefix().
1293 // returns consumed source length (mostly 1, source length in case of contraction)
1294 int GetMatchLength (ref string s, ref int idx, ref int end, int it, byte [] sortkey, bool noLv4)
1296 Contraction ct = null;
1297 // If there is already expansion, then it should not
1298 // process further expansions.
1299 if (escapedSourceIndex < 0)
1300 ct = GetContraction (s, idx, end);
1302 if (ct.SortKey != null) {
1305 for (int i = 0; i < ct.SortKey.Length; i++)
1306 if (sortkey [i] != ct.SortKey [i])
1308 return ct.Source.Length;
1310 escapedSourceIndex = idx + ct.Source.Length;
1314 return GetMatchLength (ref s, ref idx, ref end, it, sortkey, noLv4);
1317 // primitive comparison
1318 if (Compare (s [idx], it, sortkey) != 0)
1326 public bool IsSuffix (string src, string target, CompareOptions opt)
1328 return IsSuffix (src, target, src.Length - 1, src.Length, opt);
1331 public bool IsSuffix (string s, string target, int start, int length, CompareOptions opt)
1335 // quick check : simple codepoint comparison
1336 if (s.Length >= target.Length) {
1338 int se = start - length;
1339 for (int i = target.Length - 1; si >= se && i >= 0; i--, si--)
1340 if (s [si] != target [i])
1342 if (si == start + target.Length)
1346 escapedSourceIndex = -1;
1347 return IsSuffix (s, target, start, length);
1350 bool IsSuffix (string s, string target, int start, int length)
1353 int ti = target.Length - 1;
1355 int end = start - length + 1;
1358 if (IsIgnorable (target [ti])) {
1366 si = escapedSourceIndex;
1367 escapedSourceIndex = -1;
1370 if (IsIgnorable (s [si])) {
1375 // Check contraction for target.
1376 Contraction ctt = GetTailContraction (target, ti, 0);
1378 ti -= ctt.Source.Length;
1379 if (ctt.SortKey != null) {
1380 int ret = GetMatchLengthBack (ref s, ref si, ref end, -1, ctt.SortKey, true);
1385 string r = ctt.Replacement;
1386 int i = r.Length - 1;
1387 while (i >= 0 && si >= end) {
1388 int ret = GetMatchLengthBack (ref s, ref si, ref end, r [i]);
1394 if (i >= 0 && si < end)
1399 int ret = GetMatchLengthBack (ref s, ref si, ref end, target [ti]);
1407 // All codepoints in the compared range
1408 // matches. In that case, what matters
1409 // is whether the remaining part of
1410 // "target" is ignorable or not.
1412 if (!IsIgnorable (target [ti--]))
1419 // WARNING: Don't invoke it outside IsSuffix().
1420 int GetMatchLengthBack (ref string s, ref int idx, ref int end, char target)
1422 int it = FilterOptions ((int) target);
1423 charSortKey [0] = Category (it);
1424 charSortKey [1] = Level1 (it);
1425 if (!ignoreNonSpace)
1426 // FIXME: pass extender type
1427 charSortKey [2] = Level2 (it, ExtenderType.None);
1428 charSortKey [3] = Uni.Level3 (it);
1430 return GetMatchLengthBack (ref s, ref idx, ref end, it, charSortKey, !Uni.HasSpecialWeight ((char) it));
1433 // WARNING: Don't invoke it outside IsSuffix().
1434 // returns consumed source length (mostly 1, source length in case of contraction)
1435 int GetMatchLengthBack (ref string s, ref int idx, ref int end, int it, byte [] sortkey, bool noLv4)
1437 Contraction ct = null;
1438 // If there is already expansion, then it should not
1439 // process further expansions.
1440 if (escapedSourceIndex < 0)
1441 ct = GetTailContraction (s, idx, end);
1443 if (ct.SortKey != null) {
1446 for (int i = 0; i < ct.SortKey.Length; i++)
1447 if (sortkey [i] != ct.SortKey [i])
1449 return ct.Source.Length;
1451 escapedSourceIndex = idx - ct.Source.Length;
1455 return GetMatchLength (ref s, ref idx, ref end, it, sortkey, noLv4);
1458 // primitive comparison
1459 if (Compare (s [idx], it, sortkey) != 0)
1465 // Common use methods
1467 // returns comparison result.
1468 private int Compare (char src, int ct, byte [] sortkey)
1470 // char-by-char comparison.
1471 int cs = FilterOptions (src);
1475 int ret = Category (cs) - Category (ct);
1478 ret = Level1 (cs) - Level1 (ct);
1481 if (!ignoreNonSpace) {
1482 // FIXME: pass ExtenderType
1483 ret = Level2 (cs, ExtenderType.None) - Level2 (ct, ExtenderType.None);
1487 ret = Uni.Level3 (cs) - Uni.Level3 (ct);
1490 // lv.4 (only when required). No need to check cj coz
1491 // there is no pair of characters that has the same
1492 // primary level and differs here.
1493 if (!Uni.HasSpecialWeight (src))
1495 char target = (char) ct;
1496 ret = CompareFlagPair (
1497 !Uni.IsJapaneseSmallLetter (src),
1498 !Uni.IsJapaneseSmallLetter (target));
1501 ret = Uni.GetJapaneseDashType (src) -
1502 Uni.GetJapaneseDashType (target);
1505 ret = CompareFlagPair (Uni.IsHiragana (src),
1506 Uni.IsHiragana (target));
1509 ret = CompareFlagPair (!IsHalfKana (src),
1510 !IsHalfKana (target));
1514 int CompareFlagPair (bool b1, bool b2)
1516 return b1 == b2 ? 0 : b1 ? 1 : -1;
1521 #region IndexOf() / LastIndexOf()
1523 // IndexOf (string, string, CompareOptions)
1524 // IndexOf (string, string, int, int, CompareOptions)
1525 // IndexOf (string, char, int, int, CompareOptions)
1526 // IndexOfPrimitiveChar (string, int, int, char)
1527 // IndexOfSortKey (string, int, int, byte[], char, int, bool)
1528 // IndexOf (string, string, int, int)
1530 public int IndexOf (string s, string target, CompareOptions opt)
1532 return IndexOf (s, target, 0, s.Length, opt);
1535 public int IndexOf (string s, string target, int start, int length, CompareOptions opt)
1538 return IndexOf (s, target, start, length,
1539 (opt & CompareOptions.StringSort) != 0);
1542 public int IndexOf (string s, char target, CompareOptions opt)
1544 return IndexOf (s, target, 0, s.Length, opt);
1547 public int IndexOf (string s, char target, int start, int length, CompareOptions opt)
1551 // If target is contraction, then use string search.
1552 Contraction ct = GetContraction (target);
1554 if (ct.Replacement != null)
1555 return IndexOfPrimitiveChar (s, start, length, ct.Replacement [0]);
1557 return IndexOfSortKey (s, start, length, ct.SortKey, char.MinValue, -1, true);
1560 return IndexOfPrimitiveChar (s, start, length, target);
1563 // Searches target char w/o checking contractions
1564 int IndexOfPrimitiveChar (string s, int start, int length, char target)
1566 int ti = FilterOptions ((int) target);
1567 charSortKey [0] = Category (ti);
1568 charSortKey [1] = Level1 (ti);
1569 if (!ignoreNonSpace)
1570 // FIXME: pass ExtenderType
1571 charSortKey [2] = Level2 (ti, ExtenderType.None);
1572 charSortKey [3] = Uni.Level3 (ti);
1573 return IndexOfSortKey (s, start, length, charSortKey, target, ti, !Uni.HasSpecialWeight ((char) ti));
1576 // Searches target byte[] keydata
1577 int IndexOfSortKey (string s, int start, int length, byte [] sortkey, char target, int ti, bool noLv4)
1579 int end = start + length;
1580 for (int idx = start; idx < end; idx++) {
1582 if (Matches (s, ref idx, end, ti, target, sortkey, noLv4, false))
1588 // Searches string. Search head character (or keydata when
1589 // the head is contraction sortkey) and try IsPrefix().
1590 int IndexOf (string s, string target, int start, int length, bool stringSort)
1593 for (; tidx < target.Length; tidx++)
1594 if (!IsIgnorable (target [tidx]))
1596 if (tidx == target.Length)
1598 Contraction ct = GetContraction (target, tidx, target.Length - tidx);
1599 byte [] sortkey = ct != null ? ct.SortKey : null;
1600 string replace = ct != null ? ct.Replacement : null;
1603 if (sortkey != null)
1604 idx = IndexOfSortKey (s, start, length, ct.SortKey, char.MinValue, -1, true);
1605 else if (replace != null)
1606 idx = IndexOf (s, replace, start, length, stringSort);
1608 idx = IndexOfPrimitiveChar (s, start, length, target [tidx]);
1611 if (IsPrefix (s, target, idx, length - (idx - start), stringSort))
1613 Contraction cts = GetContraction (s, idx, length - (idx - start));
1615 start += cts.Source.Length;
1616 length -= cts.Source.Length;
1621 } while (length > 0);
1626 // There are the same number of IndexOf() related methods,
1627 // with the same functionalities for each.
1630 public int LastIndexOf (string s, string target, CompareOptions opt)
1632 return LastIndexOf (s, target, s.Length - 1, s.Length, opt);
1635 public int LastIndexOf (string s, string target, int start, int length, CompareOptions opt)
1638 return LastIndexOf (s, target, start, length,
1639 (opt & CompareOptions.StringSort) != 0);
1642 public int LastIndexOf (string s, char target, CompareOptions opt)
1644 return LastIndexOf (s, target, s.Length - 1, s.Length, opt);
1647 public int LastIndexOf (string s, char target, int start, int length, CompareOptions opt)
1651 // If target is contraction, then use string search.
1652 Contraction ct = GetContraction (target);
1654 if (ct.Replacement != null)
1655 return LastIndexOfPrimitiveChar (s, start, length, ct.Replacement [0]);
1657 return LastIndexOfSortKey (s, start, length, ct.SortKey, char.MinValue, -1, true);
1660 return LastIndexOfPrimitiveChar (s, start, length, target);
1663 // Searches target char w/o checking contractions
1664 int LastIndexOfPrimitiveChar (string s, int start, int length, char target)
1666 int ti = FilterOptions ((int) target);
1667 charSortKey [0] = Category (ti);
1668 charSortKey [1] = Level1 (ti);
1669 if (!ignoreNonSpace)
1670 // FIXME: pass ExtenderType
1671 charSortKey [2] = Level2 (ti, ExtenderType.None);
1672 charSortKey [3] = Uni.Level3 (ti);
1673 return LastIndexOfSortKey (s, start, length, charSortKey, target, ti, !Uni.HasSpecialWeight ((char) ti));
1676 // Searches target byte[] keydata
1677 int LastIndexOfSortKey (string s, int start, int length, byte [] sortkey, char target, int ti, bool noLv4)
1679 int end = start - length;
1681 for (int idx = start; idx > end; idx--) {
1683 if (Matches (s, ref idx, end, ti, target, sortkey, noLv4, true))
1689 // Searches string. Search head character (or keydata when
1690 // the head is contraction sortkey) and try IsPrefix().
1691 int LastIndexOf (string s, string target, int start, int length, bool stringSort)
1693 int orgStart = start;
1695 for (; tidx < target.Length; tidx++)
1696 if (!IsIgnorable (target [tidx]))
1698 if (tidx == target.Length)
1700 Contraction ct = GetContraction (target, tidx, target.Length - tidx);
1701 byte [] sortkey = ct != null ? ct.SortKey : null;
1702 string replace = ct != null ? ct.Replacement : null;
1706 if (sortkey != null)
1707 idx = LastIndexOfSortKey (s, start, length, ct.SortKey, char.MinValue, -1, true);
1708 else if (replace != null)
1709 idx = LastIndexOf (s, replace, start, length, stringSort);
1711 idx = LastIndexOfPrimitiveChar (s, start, length, target [tidx]);
1715 if (IsPrefix (s, target, idx, orgStart - idx + 1, stringSort)) {
1716 for (;idx < orgStart; idx++)
1717 if (!IsIgnorable (s [idx]))
1721 Contraction cts = GetContraction (s, idx, orgStart - idx + 1);
1723 start -= cts.Source.Length;
1724 length -= cts.Source.Length;
1729 } while (length > 0);
1733 private bool Matches (string s, ref int idx, int end, int ti, char target, byte [] sortkey, bool noLv4, bool lastIndexOf)
1735 switch (char.GetUnicodeCategory (s [idx])) {
1736 case UnicodeCategory.PrivateUse:
1737 case UnicodeCategory.Surrogate:
1738 if (s [idx] != target)
1743 char sc = char.MinValue;
1744 Contraction ct = GetContraction (s, idx, end);
1745 // if lv4 exists, it never matches contraction
1746 if (ct != null && noLv4) {
1748 idx -= ct.Source.Length - 1;
1750 idx += ct.Source.Length - 1;
1751 if (ct.SortKey != null) {
1752 for (int i = 0; i < sortkey.Length; i++)
1753 if (ct.SortKey [i] != sortkey [i])
1757 // Here is the core of LAMESPEC
1758 // described at the top of the source.
1759 sc = ct.Replacement [0];
1766 int si = FilterOptions ((int) sc);
1767 if (Category (si) != sortkey [0] ||
1768 Level1 (si) != sortkey [1] ||
1769 // FIXME: pass ExtenderType
1770 !ignoreNonSpace && Level2 (si, ExtenderType.None) != sortkey [2] ||
1771 Uni.Level3 (si) != sortkey [3])
1773 if (noLv4 && !Uni.HasSpecialWeight ((char) si))
1777 if (Uni.IsJapaneseSmallLetter ((char) si) !=
1778 Uni.IsJapaneseSmallLetter ((char) ti) ||
1779 Uni.GetJapaneseDashType ((char) si) !=
1780 Uni.GetJapaneseDashType ((char) ti) ||
1781 !Uni.IsHiragana ((char) si) !=
1782 !Uni.IsHiragana ((char) ti) ||
1783 IsHalfKana ((char) si) !=
1784 IsHalfKana ((char) ti))