Merge pull request #297 from ermshiperete/4959
[mono.git] / mcs / class / corlib / Mono.Globalization.Unicode / SortKeyBuffer.cs
index 0c91cd9943c54e7f7275ccc203e6e7d52404fc76..61e909aa1d97f15ca49ce1134e0f70ad909265b9 100644 (file)
@@ -1,27 +1,62 @@
+//
+// SortKeyBuffer.cs : buffer implementation for GetSortKey()
+//
+// Author:
+//     Atsushi Enomoto  <atsushi@ximian.com>
+//
+// Copyright (C) 2005 Novell, Inc (http://www.novell.com)
+//
+// Permission is hereby granted, free of charge, to any person obtaining
+// a copy of this software and associated documentation files (the
+// "Software"), to deal in the Software without restriction, including
+// without limitation the rights to use, copy, modify, merge, publish,
+// distribute, sublicense, and/or sell copies of the Software, and to
+// permit persons to whom the Software is furnished to do so, subject to
+// the following conditions:
+// 
+// The above copyright notice and this permission notice shall be
+// included in all copies or substantial portions of the Software.
+// 
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+//
+
 using System;
 using System.IO;
 using System.Globalization;
 
-
 namespace Mono.Globalization.Unicode
 {
        // Internal sort key storage that is reused during GetSortKey.
        internal class SortKeyBuffer
        {
-               // l4s = small kana sensitivity, l4t = kana character type,
+               // l4s = small kana sensitivity, l4t = mark type,
                // l4k = katakana flag, l4w = kana width sensitivity
-               int l1, l2, l3, l4s, l4t, l4k, l4w, l5;
                byte [] l1b, l2b, l3b, l4sb, l4tb, l4kb, l4wb, l5b;
-               int level5LastPos;
+//             int level5LastPos;
 
-               public SortKeyBuffer ()
+               string source;
+               int l1, l2, l3, l4s, l4t, l4k, l4w, l5;
+               int lcid;
+               CompareOptions options;
+               bool processLevel2;
+               bool frenchSort;
+               bool frenchSorted;
+
+               public SortKeyBuffer (int lcid)
                {
                }
 
                public void Reset ()
                {
                        l1 = l2 = l3 = l4s = l4t = l4k = l4w = l5 = 0;
-                       level5LastPos = 0;
+//                     level5LastPos = 0;
+                       frenchSorted = false;
                }
 
                // It is used for CultureInfo.ClearCachedData().
@@ -30,18 +65,25 @@ namespace Mono.Globalization.Unicode
                        l1b = l2b = l3b = l4sb = l4tb = l4kb = l4wb = l5b = null;
                }
 
-               internal void AdjustBufferSize (string s)
+               internal void Initialize (CompareOptions options, int lcid, string s, bool frenchSort)
                {
+                       this.source = s;
+                       this.lcid = lcid;
+                       this.options = options;
+                       int len = s.Length;
+                       processLevel2 = (options & CompareOptions.IgnoreNonSpace) == 0;
+                       this.frenchSort = frenchSort;
+
                        // For Korean text it is likely to be much bigger (for
                        // Jamo), but even in ko-KR most of the compared
                        // strings won't be Hangul.
-                       if (l1b == null || l1b.Length < s.Length)
-                               l1b = new byte [s.Length * 2 + 10];
+                       if (l1b == null || l1b.Length < len)
+                               l1b = new byte [len * 2 + 10];
 
-                       if (l2b == null || l2b.Length < s.Length)
-                               l2b = new byte [s.Length + 10];
-                       if (l3b == null || l3b.Length < s.Length)
-                               l3b = new byte [s.Length + 10];
+                       if (processLevel2 && (l2b == null || l2b.Length < len))
+                               l2b = new byte [len + 10];
+                       if (l3b == null || l3b.Length < len)
+                               l3b = new byte [len + 10];
 
                        // This weight is used only in Japanese text.
                        // We could expand the initial length as well as
@@ -60,9 +102,25 @@ namespace Mono.Globalization.Unicode
                                l5b = new byte [10];
                }
 
+               internal void AppendCJKExtension (byte lv1msb, byte lv1lsb)
+               {
+                       AppendBufferPrimitive (0xFE, ref l1b, ref l1);
+                       AppendBufferPrimitive (0xFF, ref l1b, ref l1);
+                       AppendBufferPrimitive (lv1msb, ref l1b, ref l1);
+                       AppendBufferPrimitive (lv1lsb, ref l1b, ref l1);
+                       if (processLevel2)
+                               AppendBufferPrimitive (2, ref l2b, ref l2);
+                       AppendBufferPrimitive (2, ref l3b, ref l3);
+               }
+
+               // LAMESPEC: Windows handles some of Hangul Jamo as to have
+               // more than two primary weight values. However this causes
+               // incorrect zero-termination. So I just ignore them and
+               // treat it as usual character.
+               /*
                internal void AppendJamo (byte category, byte lv1msb, byte lv1lsb)
                {
-                       AppendNormal (category, lv1msb, lv2, lv3);
+                       AppendNormal (category, lv1msb, 0, 0);
                        AppendBufferPrimitive (0xFF, ref l1b, ref l1);
                        AppendBufferPrimitive (lv1lsb, ref l1b, ref l1);
                        AppendBufferPrimitive (0xFF, ref l1b, ref l1);
@@ -72,44 +130,76 @@ namespace Mono.Globalization.Unicode
                        AppendBufferPrimitive (0xFF, ref l1b, ref l1);
                        AppendBufferPrimitive (0, ref l1b, ref l1);
                }
+               */
 
                // Append sort key value from table normally.
-               internal void AppendKana (byte category, byte lv1, byte lv2, byte lv3, bool isSmallKana, byte kanaType, bool isKatakana, byte kanaWidth)
+               internal void AppendKana (byte category, byte lv1, byte lv2, byte lv3, bool isSmallKana, byte markType, bool isKatakana, bool isHalfWidth)
                {
                        AppendNormal (category, lv1, lv2, lv3);
 
-                       AppendBufferPrimitive (isSmallKana ? 0xC4 : 0xE4, ref l4sb, ref l4s);
-                       AppendBufferPrimitive (kanaType, ref l4tb, ref l4t);
-                       AppendBufferPrimitive (isKatakana ? 0xC4 : 0xE4, ref l4kb, ref l4k);
-                       AppendBufferPrimitive (kanaWidth, ref l4wb, ref l4w);
+                       AppendBufferPrimitive ((byte) (isSmallKana ? 0xC4 : 0xE4), ref l4sb, ref l4s);
+                       AppendBufferPrimitive (markType, ref l4tb, ref l4t);
+                       AppendBufferPrimitive ((byte) (isKatakana ? 0xC4 : 0xE4), ref l4kb, ref l4k);
+                       AppendBufferPrimitive ((byte) (isHalfWidth ? 0xC4 : 0xE4), ref l4wb, ref l4w);
                }
 
                // Append sort key value from table normally.
                internal void AppendNormal (byte category, byte lv1, byte lv2, byte lv3)
                {
-                       AppendBufferPrimitive (category, ref l1b, ref l1);
-                       AppendBufferPrimitive (lv1, ref l1b, ref l1);
-                       AppendBufferPrimitive (lv2, ref l2b, ref l2);
+                       if (lv2 == 0)
+                               lv2 = 2;
+                       if (lv3 == 0)
+                               lv3 = 2;
+
+                       // Special weight processing
+                       if (category == 6 && (options & CompareOptions.StringSort) == 0) {
+                               AppendLevel5 (category, lv1);
+                               return;
+                       }
+
+                       // non-primary diacritical weight is added to that of
+                       // the previous character (and does not reset level 3
+                       // weight).
+                       if (processLevel2 && category == 1 && l1 > 0) {
+                               lv2 = (byte) (lv2 + l2b [--l2]);
+                               lv3 = l3b [--l3];
+                       }
+
+                       if (category != 1) {
+                               AppendBufferPrimitive (category, ref l1b, ref l1);
+                               AppendBufferPrimitive (lv1, ref l1b, ref l1);
+                       }
+                       if (processLevel2)
+                               AppendBufferPrimitive (lv2, ref l2b, ref l2);
                        AppendBufferPrimitive (lv3, ref l3b, ref l3);
                }
 
                // Append variable-weight character.
-               internal void AppendLevel5 (byte [] table, int idx, int currentIndex)
+               // It uses level 2 index for counting offsets (since level1
+               // might be longer than 1).
+               private void AppendLevel5 (byte category, byte lv1)
                {
                        // offset
-                       int offsetValue = currentIndex - level5LastPos;
-                       for (; offsetValue > 8064; offsetValue -= 8064)
+#if false
+                       // If it strictly matches to Windows, offsetValue is always l2.
+                       int offsetValue = l2 - level5LastPos;
+                       // If it strictly matches ti Windows, no 0xFF here.
+                       for (; offsetValue > 8192; offsetValue -= 8192)
                                AppendBufferPrimitive (0xFF, ref l5b, ref l5);
-                       if (offsetValue > 63)
-                               AppendBufferPrimitive ((byte) (offsetValue - 63 / 4 + 0x80), ref l5b, ref l5);
-                       AppendBufferPrimitive ((byte) (offsetValue % 63), ref l5b, ref l5);
+#else
+                       // LAMESPEC: Windows cannot compute lv5 values for
+                       // those string that has length larger than 8064.
+                       // (It reminds me of SQL Server varchar length).
+                       int offsetValue = (l2 + 1) % 8192;
+#endif
+                       AppendBufferPrimitive ((byte) ((offsetValue / 64) + 0x80), ref l5b, ref l5);
+                       AppendBufferPrimitive ((byte) (offsetValue % 64 * 4 + 3), ref l5b, ref l5);
 
-                       level5LastPos = currentIndex;
+//                     level5LastPos = l2;
 
                        // sortkey value
-                       idx++; // skip the "variable" mark: 01
-                       while (table [idx] != 0)
-                               AppendBufferPrimitive (table [idx++], ref l5b, ref l5);
+                       AppendBufferPrimitive (category, ref l5b, ref l5);
+                       AppendBufferPrimitive (lv1, ref l5b, ref l5);
                }
 
                private void AppendBufferPrimitive (byte value, ref byte [] buf, ref int bidx)
@@ -122,9 +212,9 @@ namespace Mono.Globalization.Unicode
                        }
                }
 
-               public byte [] GetResultAndReset ()
+               public SortKey GetResultAndReset ()
                {
-                       byte [] ret = GetResult ();
+                       SortKey ret = GetResult ();
                        Reset ();
                        return ret;
                }
@@ -133,29 +223,42 @@ namespace Mono.Globalization.Unicode
                // 02 02 02 -> 0
                // 02 03 02 -> 2
                // 03 04 05 -> 3
-               private int GetOptimizedLength (byte [] data, int len)
+               private int GetOptimizedLength (byte [] data, int len, byte defaultValue)
                {
                        int cur = -1;
                        for (int i = 0; i < len; i++)
-                               if (data [i] != 2)
+                               if (data [i] != defaultValue)
                                        cur = i;
                        return cur + 1;
                }
 
-               public byte [] GetResult ()
+               public SortKey GetResult ()
                {
-                       l2 = GetOptimizedLength (l2b, l2);
-                       l3 = GetOptimizedLength (l3b, l3);
-                       l4s = GetOptimizedLength (l4sb, l4s);
-                       l4t = GetOptimizedLength (l4tb, l4t);
-                       l4k = GetOptimizedLength (l4kb, l4k);
-                       l4w = GetOptimizedLength (l4wb, l4w);
-                       l5 = GetOptimizedLength (l5b, l5);
+                       if (source.Length == 0)
+                               return new SortKey (lcid, source, new byte [0], options, 0, 0, 0, 0, 0, 0, 0, 0);
+
+                       if (frenchSort && !frenchSorted && l2b != null) {
+                               int i = 0;
+                               for (; i < l2b.Length; i++)
+                                       if (l2b [i] == 0)
+                                               break;
+                               Array.Reverse (l2b, 0, i);
+                               frenchSorted = true;
+                       }
+
+                       l2 = GetOptimizedLength (l2b, l2, 2);
+                       l3 = GetOptimizedLength (l3b, l3, 2);
+                       bool hasJapaneseWeight = (l4s > 0); // snapshot before being optimized
+                       l4s = GetOptimizedLength (l4sb, l4s, 0xE4);
+                       l4t = GetOptimizedLength (l4tb, l4t, 3);
+                       l4k = GetOptimizedLength (l4kb, l4k, 0xE4);
+                       l4w = GetOptimizedLength (l4wb, l4w, 0xE4);
+                       l5 = GetOptimizedLength (l5b, l5, 2);
 
                        int length = l1 + l2 + l3 + l5 + 5;
                        int jpLength = l4s + l4t + l4k + l4w;
-                       if (jpLength > 0)
-                               length += jpLength + 3;
+                       if (hasJapaneseWeight)
+                               length += jpLength + 4;
 
                        byte [] ret = new byte [length];
                        Array.Copy (l1b, ret, l1);
@@ -169,25 +272,26 @@ namespace Mono.Globalization.Unicode
                                Array.Copy (l3b, 0, ret, cur, l3);
                        cur += l3;
                        ret [cur++] = 1; // end-of-level mark
-                       if (jpLength > 0) {
-                               Array.Copy (l4s, 0, ret, cur, l4s);
+                       if (hasJapaneseWeight) {
+                               Array.Copy (l4sb, 0, ret, cur, l4s);
                                cur += l4s;
                                ret [cur++] = 0xFF; // end-of-jp-subsection
-                               Array.Copy (l4t, 0, ret, cur, l4t);
+                               Array.Copy (l4tb, 0, ret, cur, l4t);
                                cur += l4t;
                                ret [cur++] = 2; // end-of-jp-middle-subsection
-                               Array.Copy (l4k, 0, ret, cur, l4k);
+                               Array.Copy (l4kb, 0, ret, cur, l4k);
                                cur += l4k;
                                ret [cur++] = 0xFF; // end-of-jp-subsection
-                               Array.Copy (l4w, 0, ret, cur, l4w);
+                               Array.Copy (l4wb, 0, ret, cur, l4w);
                                cur += l4w;
+                               ret [cur++] = 0xFF; // end-of-jp-subsection
                        }
                        ret [cur++] = 1; // end-of-level mark
                        if (l5 > 0)
                                Array.Copy (l5b, 0, ret, cur, l5);
                        cur += l5;
                        ret [cur++] = 0; // end-of-data mark
-                       return ret;
+                       return new SortKey (lcid, source, ret, options, l1, l2, l3, l4s, l4t, l4k, l4w, l5);
                }
        }
 }