2005-04-26 Atsushi Enomoto <atsushi@ximian.com>

author Atsushi Eno <atsushieno@gmail.com>

Tue, 26 Apr 2005 08:45:03 +0000 (08:45 -0000)

committer Atsushi Eno <atsushieno@gmail.com>

Tue, 26 Apr 2005 08:45:03 +0000 (08:45 -0000)
author Atsushi Eno <atsushieno@gmail.com>
Tue, 26 Apr 2005 08:45:03 +0000 (08:45 -0000)
committer Atsushi Eno <atsushieno@gmail.com>
Tue, 26 Apr 2005 08:45:03 +0000 (08:45 -0000)
diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog b/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog

index 68d5003138c351185f1e815a2db247f19e740f7a..55854cb98ff959dc3fe7e82acece8ffb1a78c2c1 100644 (file)
--- a/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog
+++ b/mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog
@@ -1,3 +1,10 @@
+2005-04-26  Atsushi Enomoto  <atsushi@ximian.com>
+
+       * Collation-notes.txt : some updates.
+       * create-char-mapping-source.cs : superscripts and subscripts are also
+         ignored in IgnoreWidth comparison.
+       * Makefile : tiny touch fix.
+
  2005-04-25  Atsushi Enomoto  <atsushi@ximian.com>
  
         * CompareInfoImpl.cs, Collator.cs : conceptual stuff (not working).
diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/Collation-notes.txt b/mcs/class/corlib/Mono.Globalization.Unicode/Collation-notes.txt

index ef2e96d0507e80f070d0439536123da41572564c..65a30763c2828e96dedd2f02f815a3a439c1b892 100644 (file)
--- a/mcs/class/corlib/Mono.Globalization.Unicode/Collation-notes.txt
+++ b/mcs/class/corlib/Mono.Globalization.Unicode/Collation-notes.txt
@@ -1,10 +1,15 @@
  String collation
  
+* Summary
+
+       We are going to implement Windows-like collation, unlike ICU, which
+       conforms to the Unicode specifications.
+
  * CompareInfo members
  
         GetSortKey()
                 Compute sort key for every character into byte[].
-               Use collation element table.
+               Use collation element table, but Windows specific ones.
         Compare()
                 Find first difference and compare it. "Larger/smaller" matters.
         IsPrefix()
@@ -34,8 +39,7 @@ String collation
         not distinguished, but Hiragana "A" and Hiragana "I" are.
  
         Actually, even without any IgnoreXXX flags (i.e. "None"), there are 
-       many characters that are excluded from results. I'd say it as
-       "completely ignorable" characters (as said in UCA).
+       many characters that are ignored ("completely ignorable").
  
         For LCID 101/1125(div), '\ufdf2' is completely ignorable.
         This rule even applies to CompareOptions.None.
@@ -49,10 +53,10 @@ String collation
                 and IgnoreCase, I\u0307 is not regarded as equal to i.
  
         IgnoreKanaType
-               We need ToHiragana() like ToLower(). See also "Notes".
+               ToKanaTypeInsensitive(). See also "Notes".
  
         IgnoreWidth
-               We need ToFullWidth(), which is likely to be culture
+               ToWidthInsensitive(), which is likely to be culture
                 independent. See also "Notes".
  
  ** Strippers
@@ -61,15 +65,15 @@ String collation
         compatible (at least with .NET 1.1 invariant culture).
  
         IgnoreNonSpace
-               It is in a black box.
-               - Some Diacritic characters are covered by this flag.
+               IsIgnorableNonSpacing().
+               Some Diacritic characters are covered by this flag.
  
                 There are some culture *dependent* characters:
                         LCID 90/1114(syr) : 64b, 652, 670
  
         IgnoreSymbols
-               We need to implement our own Char.IsSymbol().
-               UnicodeCategory does not work.
+               IsIgnorableSymbol().
+               UnicodeCategory does not work here.
  
                 There are some culture *dependent* characters:
                         LCID 17/1041(ja) : 2015
@@ -88,16 +92,20 @@ String collation
  
  * Collation element table tailoring
  
+       Deprecated; We won't use collation element table from unicode.org.
+
         We will contain only the default element table and Chinese table.
         (Japanese might be added too, since CLDR contains a large table for it)
  
-       Other rules are always "evaluated"; no physical expansion is done.
+       Other rules are always "evaluated"; no physical expansion is done to
+       the table loaded in memory (that would be too wasteful).
  
  * Notes
  
         Since UCA Level 3 handles both casing and width, it is impossible to
         use UCA variables for IgnoreWidth, at least with the default element
-       table.
+       table. And IgnoreKanaType cannot be handled without case and width
+       insensitivity.
  
         IgnoreWidth/IgnoreSymbols is processed after Kana voice mark
         decomposition (NFD).
@@ -109,3 +117,101 @@ String collation
         Myanmar, Mongolian, Cherokee, Ethiopic, Tagalog, Khmer, are regarded as
         "completely ignorable".
  
+* MS collation design inference
+
+** sort key format
+
+       00 means the end of sort key.
+       01 means the end of the level.
+       02-FF means the value.
+
+       There are 5 levels.
+
+       - level 1: primary difference
+         The first byte of level 1 means the category of the character.
+       - level 2: case sensitivity
+       - level 3: diacritic difference
+       - level 4: kana type (mostly at primary category 22)
+       - level 5: control characters etc.
+
+** default
+
+       So the problem is, how to detect diacritic. Maybe they are combined
+       similarly to what is specified in UCA.
+
+*** sort order categories
+
+       1 (0) specially ignored ones (Japanese, Tamil, Thai)
+
+       3099-309C, BCD, E47, E4C, FF9E, FF9F
+
+       2 (1) maybe nonspacing marks
+
+       2.1 control characters (specified as such in Unicode), except for
+       whitespaces (0009-000D).
+
+       2.2 0027,FF07 (')
+
+       2.3  minus sign, hyphen, dash
+         minus signs: FE63, 207B (super), 208B (sub), 002D, FF0D (full-width)
+         hyphens: 00AD (soft), 2010, 2011 (nonbreaking) ... Unicode HYPHEN?
+         dashes, horizontal bars: FE58 ... Unicode DASH?
+
+       2.4 Arabic spacing and equivalents (64B-651, FE70-FE7F)
+         They are part of nonspacing mark, but not equal.
+
+       2.5 Nonspacing marks mixed
+         30D, 591-5C2, Mn:981-A3C, A4D, A70, A71, ABC, ABD ...
+
+       3 (7) space separators and some kind of marks
+
+       3.1 whitespaces, paragraph separator etc.
+
+       3.2 other marks ('!', '^', ...)
+
+       4 (8) mathematical symbols
+
+       5 (9) some other symbols
+
+       6 (A) punctuations
+
+       7 (C) numbers
+
+       8 (E) latin letters (alphabets)
+
+       9 (F) greek letters
+
+       ...
+
+          (21) georgian letters
+
+       13 (22) japanese kana letters and symbols
+
+       14 (23) bopomofo letters
+
+       15 (24) syriac letters
+
+       16 (41-45) surrogate Pt.1
+
+       17 (52-7E) hangul
+
+       18 (9E-FE) CJK (kangxi etc.), PrivateUse mixed, surrogate Pt.2
+
+       19 (FE) CJK extensions (3400-)
+
+       20 (FF) Some supplemental Japanese/Arabic marks
+
+** Traditional Spanish
+
+       It has some combined characters as a unique character (like 'll').
+
+** Czech
+
+       Invariant culture also puts Czech unique character \u0161 between s
+       and t, unlike described here:
+       http://www.microsoft.com/globaldev/dis_v1/disv1.asp?DID=dis33d&File=S24C0.asp
+
+** Other locales
+
+       There are some character reorderings.
+
diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/Makefile b/mcs/class/corlib/Mono.Globalization.Unicode/Makefile

index 90eb5ac73e59f0d8dae613b66d124f3b09c270e6..ab7dd98b6e129b1c020183a8960a9aaed5ce4914 100644 (file)
--- a/mcs/class/corlib/Mono.Globalization.Unicode/Makefile
+++ b/mcs/class/corlib/Mono.Globalization.Unicode/Makefile
@@ -94,7 +94,7 @@ $(CB_CLASS_TABLE) :
         if ! test -d UCD/extracted; then mkdir UCD/extracted; fi
         wget http://www.unicode.org/Public/UNIDATA/extracted/DerivedCombiningClass.txt
         mv DerivedCombiningClass.txt UCD/extracted/
-       touch UCD/DerivedCombiningClass.txt
+       touch UCD/extracted/DerivedCombiningClass.txt
  
  
  sample : 
diff --git a/mcs/class/corlib/Mono.Globalization.Unicode/create-char-mapping-source.cs b/mcs/class/corlib/Mono.Globalization.Unicode/create-char-mapping-source.cs

index a3656d1856b33819d861f958aa7c666c3424e01a..7916b3f752da0d2353a6bb51d4d38f2bf4ca7aa5 100644 (file)
--- a/mcs/class/corlib/Mono.Globalization.Unicode/create-char-mapping-source.cs
+++ b/mcs/class/corlib/Mono.Globalization.Unicode/create-char-mapping-source.cs
@@ -224,6 +224,8 @@ namespace Mono.Globalization.Unicode
                                         switch (combiningCategory) {
                                         case "narrow":
                                         case "wide":
+                                       case "super":
+                                       case "sub":
                                                 widthSensitives.Add (cp);
                                                 break;
                                         }
author	Atsushi Eno <atsushieno@gmail.com>
	Tue, 26 Apr 2005 08:45:03 +0000 (08:45 -0000)
committer	Atsushi Eno <atsushieno@gmail.com>
	Tue, 26 Apr 2005 08:45:03 +0000 (08:45 -0000)
mcs/class/corlib/Mono.Globalization.Unicode/ChangeLog		patch \| blob \| history
mcs/class/corlib/Mono.Globalization.Unicode/Collation-notes.txt		patch \| blob \| history
mcs/class/corlib/Mono.Globalization.Unicode/Makefile		patch \| blob \| history
mcs/class/corlib/Mono.Globalization.Unicode/create-char-mapping-source.cs		patch \| blob \| history