2005-04-26 Atsushi Enomoto <atsushi@ximian.com>

author Atsushi Eno <atsushieno@gmail.com>

Tue, 26 Apr 2005 08:45:03 +0000 (08:45 -0000)

committer Atsushi Eno <atsushieno@gmail.com>

Tue, 26 Apr 2005 08:45:03 +0000 (08:45 -0000)
author Atsushi Eno <atsushieno@gmail.com>
Tue, 26 Apr 2005 08:45:03 +0000 (08:45 -0000)
committer Atsushi Eno <atsushieno@gmail.com>
Tue, 26 Apr 2005 08:45:03 +0000 (08:45 -0000)
diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog b/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog

index 68d5003138c351185f1e815a2db247f19e740f7a..55854cb98ff959dc3fe7e82acece8ffb1a78c2c1 100644 (file)
--- a/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog
+++ b/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog
@@ -1,3 +1,10 @@
+2005-04-26  Atsushi Enomoto  <atsushi@ximian.com>
+
+       * Collation-notes.txt : some updates.
+       * create-char-mapping-source.cs : superscripts and subscripts are also
+         ignored in IgnoreWidth comparison.
+       * Makefile : tiny touch fix.
+
  2005-04-25  Atsushi Enomoto  <atsushi@ximian.com>
  
         * CompareInfoImpl.cs, Collator.cs : conceptual stuff (not working).
  2005-04-25  Atsushi Enomoto  <atsushi@ximian.com>
  
         * CompareInfoImpl.cs, Collator.cs : conceptual stuff (not working).
diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/Collation-notes.txt b/mcs/class/corlib/Mono.Globalization.Unicode/Collation-notes.txt

index ef2e96d0507e80f070d0439536123da41572564c..65a30763c2828e96dedd2f02f815a3a439c1b892 100644 (file)
--- a/mcs/class/corlib/Mono.Globalization.Unicode/Collation-notes.txt
+++ b/mcs/class/corlib/Mono.Globalization.Unicode/Collation-notes.txt
@@ -1,10 +1,15 @@
  String collation
  
  String collation
  
+* Summary
+
+       We are going to implement Windows-like collation, as opposed to ICU,
+       which conforms to the Unicode specifications.
+
  * CompareInfo members
  
         GetSortKey()
                 Compute sort key for every characters into byte[].
  * CompareInfo members
  
         GetSortKey()
                 Compute sort key for every characters into byte[].
-               Use collation element table.
+               Use collation element table, but Windows specific ones.
         Compare()
                 Find first difference and compare it. "Larger/smaller" matters.
         IsPrefix()
         Compare()
                 Find first difference and compare it. "Larger/smaller" matters.
         IsPrefix()
@@ -34,8 +39,7 @@ String collation
         not distinguished, but Hiragana "A" and Hiragana "I" are.
  
         Actually, even without any IgnoreXXX flags (i.e. "None"), there are 
         not distinguished, but Hiragana "A" and Hiragana "I" are.
  
         Actually, even without any IgnoreXXX flags (i.e. "None"), there are 
-       many characters that are excluded from results. I'd say it as
-       "completely ignorable" characters (as said in UCA).
+       many characters that are ignored ("completely ignorable").
  
         For LCID 101/1125(div), '\ufdf2' is completely ignorable.
         This rule even applies to CompareOptions.None.
  
         For LCID 101/1125(div), '\ufdf2' is completely ignorable.
         This rule even applies to CompareOptions.None.
@@ -49,10 +53,10 @@ String collation
                 and IgnoreCase, I\u0307 is not regarded as equal to i.
  
         IgnoreKanaType
                 and IgnoreCase, I\u0307 is not regarded as equal to i.
  
         IgnoreKanaType
-               We need ToHiragana() like ToLower(). See also "Notes".
+               ToKanaTypeInsensitive(). See also "Notes".
  
         IgnoreWidth
  
         IgnoreWidth
-               We need ToFullWidth(), which is likely to be culture
+               ToWidthInsensitive(), which is likely to be culture
                 independent. See also "Notes".
  
  ** Strippers
                 independent. See also "Notes".
  
  ** Strippers
@@ -61,15 +65,15 @@ String collation
         compatible (at least with .NET 1.1 invariant culture).
  
         IgnoreNonSpace
         compatible (at least with .NET 1.1 invariant culture).
  
         IgnoreNonSpace
-               It is in a black box.
-               - Some Diacritic characters are covered by this flag.
+               IsIgnorableNonSpacing().
+               Some Diacritic characters are covered by this flag.
  
                 There are some culture *dependent* characters:
                         LCID 90/1114(syr) : 64b, 652, 670
  
         IgnoreSymbols
  
                 There are some culture *dependent* characters:
                         LCID 90/1114(syr) : 64b, 652, 670
  
         IgnoreSymbols
-               We need to implement our own Char.IsSymbol().
-               UnicodeCategory does not work.
+               IsIgnorableSymbol().
+               UnicodeCategory does not work here.
  
                 There are some culture *dependent* characters:
                         LCID 17/1041(ja) : 2015
  
                 There are some culture *dependent* characters:
                         LCID 17/1041(ja) : 2015
@@ -88,16 +92,20 @@ String collation
  
  * Collation element table tailoring
  
  
  * Collation element table tailoring
  
+       Deprecated; We won't use collation element table from unicode.org.
+
         We will contain only the default element table and Chinese table.
         (Japanese might be added too, since CLDR contains a large table for it)
  
         We will contain only the default element table and Chinese table.
         (Japanese might be added too, since CLDR contains a large table for it)
  
-       Other rules are always "evaluated"; no physical expansion is done.
+       Other rules are always "evaluated"; no physical expansion is done to
+       the table loaded in memory (it would be too wasteful).
  
  * Notes
  
         Since UCA Level 3 handles both casing and width, it is impossible to
         use UCA variables for IgnoreWidth, at least with the default element
  
  * Notes
  
         Since UCA Level 3 handles both casing and width, it is impossible to
         use UCA variables for IgnoreWidth, at least with the default element
-       table.
+       table. And IgnoreKanaType cannot be handled without case and width
+       insensitivity.
  
         IgnoreWidth/IgnoreSymbols is processed after Kana voice mark
         decomposition (NFD).
  
         IgnoreWidth/IgnoreSymbols is processed after Kana voice mark
         decomposition (NFD).
@@ -109,3 +117,101 @@ String collation
         Myanmar, Mongolian, Cherokee, Ethiopic, Tagalog, Khmer, are regarded as
         "completely ignorable".
  
         Myanmar, Mongolian, Cherokee, Ethiopic, Tagalog, Khmer, are regarded as
         "completely ignorable".
  
+* MS collation design inference
+
+** sort key format
+
+       00 means the end of sort key.
+       01 means the end of the level.
+       02-FF means the value.
+
+       There are 5 levels.
+
+       - level 1: primary difference
+         The first byte of level 1 means the category of the character.
+       - level 2: case sensitivity
+       - level 3: diacritic difference
+       - level 4: kana type (mostly at primary category 22)
+       - level 5: control characters etc.
+
+** default
+
+       So the problem is, how to detect diacritic. Maybe they are combined
+       similarly to what is specified in UCA.
+
+*** sort order categories
+
+       1 (0) specially ignored ones (Japanese, Tamil, Thai)
+
+       3099-309C, BCD, E47, E4C, FF9E, FF9F
+
+       2 (1) maybe nonspacing marks
+
+       2.1 control characters (specified as such in Unicode), except for
+       whitespaces (0009-000D).
+
+       2.2 0027,FF07 (')
+
+       2.3  minus sign, hyphen, dash
+         minus signs: FE63, 207B (super), 208B (sub), 002D, FF0D (full-width)
+         hyphens: 00AD (soft), 2010, 2011 (nonbreaking) ... Unicode HYPHEN?
+         dashes, horizontal bars: FE58 ... Unicode DASH?
+
+       2.4 Arabic spacing and equivalents (64B-651, FE70-FE7F)
+         They are part of nonspacing mark, but not equal.
+
+       2.5 Nonspacing marks mixed
+         30D, 591-5C2, Mn:981-A3C, A4D, A70, A71, ABC, ABD ...
+
+       3 (7) space separators and some kind of marks
+
+       3.1 whitespaces, paragraph separator etc.
+
+       3.2 other marks ('!', '^', ...)
+
+       4 (8) mathematical symbols
+
+       5 (9) some other symbols
+
+       6 (A) punctuations
+
+       7 (C) numbers
+
+       8 (E) latin letters (alphabets)
+
+       9 (F) greek letters
+
+       ...
+
+          (21) georgian letters
+
+       13 (22) japanese kana letters and symbols
+
+       14 (23) bopomofo letters
+
+       15 (24) syriac letters
+
+       16 (41-45) surrogate Pt.1
+
+       17 (52-7E) hangul
+
+       18 (9E-FE) CJK (kangxi etc.), PrivateUse mixed, surrogate Pt.2
+
+       19 (FE) CJK extensions (3400-)
+
+       20 (FF) Some supplemental Japanese/Arabic marks
+
+** Traditional Spanish
+
+       It has some combined characters as a unique character (like 'll').
+
+** Czech
+
+       Invariant culture also puts Czech unique character \u0161 between s
+       and t, unlike what is described here:
+       http://www.microsoft.com/globaldev/dis_v1/disv1.asp?DID=dis33d&File=S24C0.asp
+
+** Other locales
+
+       There are some character reorderings.
+
diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/Makefile b/mcs/class/corlib/Mono.Globalization.Unicode/Makefile

index 90eb5ac73e59f0d8dae613b66d124f3b09c270e6..ab7dd98b6e129b1c020183a8960a9aaed5ce4914 100644 (file)
--- a/mcs/class/corlib/Mono.Globalization.Unicode/Makefile
+++ b/mcs/class/corlib/Mono.Globalization.Unicode/Makefile
@@ -94,7 +94,7 @@ $(CB_CLASS_TABLE) :
         if ! test -d UCD/extracted; then mkdir UCD/extracted; fi
         wget http://www.unicode.org/Public/UNIDATA/extracted/DerivedCombiningClass.txt
         mv DerivedCombiningClass.txt UCD/extracted/
         if ! test -d UCD/extracted; then mkdir UCD/extracted; fi
         wget http://www.unicode.org/Public/UNIDATA/extracted/DerivedCombiningClass.txt
         mv DerivedCombiningClass.txt UCD/extracted/
-       touch UCD/DerivedCombiningClass.txt
+       touch UCD/extracted/DerivedCombiningClass.txt
  
  
  sample : 
  
  
  sample : 
diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/create-char-mapping-source.cs b/mcs/class/corlib/Mono.Globalization.Unicode/create-char-mapping-source.cs

index a3656d1856b33819d861f958aa7c666c3424e01a..7916b3f752da0d2353a6bb51d4d38f2bf4ca7aa5 100644 (file)
--- a/mcs/class/corlib/Mono.Globalization.Unicode/create-char-mapping-source.cs
+++ b/mcs/class/corlib/Mono.Globalization.Unicode/create-char-mapping-source.cs
@@ -224,6 +224,8 @@ namespace Mono.Globalization.Unicode
                                         switch (combiningCategory) {
                                         case "narrow":
                                         case "wide":
                                         switch (combiningCategory) {
                                         case "narrow":
                                         case "wide":
+                                       case "super":
+                                       case "sub":
                                                 widthSensitives.Add (cp);
                                                 break;
                                         }
                                                 widthSensitives.Add (cp);
                                                 break;
                                         }
author	Atsushi Eno <atsushieno@gmail.com>
	Tue, 26 Apr 2005 08:45:03 +0000 (08:45 -0000)
committer	Atsushi Eno <atsushieno@gmail.com>
	Tue, 26 Apr 2005 08:45:03 +0000 (08:45 -0000)
mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog		patch \| blob \| history
mcs/class/corlib/Mono.Globalization.Unicode/Collation-notes.txt		patch \| blob \| history
mcs/class/corlib/Mono.Globalization.Unicode/Makefile		patch \| blob \| history
mcs/class/corlib/Mono.Globalization.Unicode/create-char-mapping-source.cs		patch \| blob \| history