X-Git-Url: http://wien.tomnetworks.com/gitweb/?a=blobdiff_plain;f=mcs%2Fclass%2FMono.C5%2FUserGuideExamples%2FAnagrams.cs;h=018d7b361084737455441bb4e5f0dc1e96e3d716;hb=39fac8ab1c15fb397ad8e29cc300c5185bdc7dfc;hp=62cfbf002ffb475e789fd1340488551c964ebe41;hpb=f99ce750ee781a2584e849a0264300fa4d99aaaa;p=mono.git diff --git a/mcs/class/Mono.C5/UserGuideExamples/Anagrams.cs b/mcs/class/Mono.C5/UserGuideExamples/Anagrams.cs index 62cfbf002ff..018d7b36108 100644 --- a/mcs/class/Mono.C5/UserGuideExamples/Anagrams.cs +++ b/mcs/class/Mono.C5/UserGuideExamples/Anagrams.cs @@ -1,177 +1,177 @@ -/* - Copyright (c) 2003-2006 Niels Kokholm and Peter Sestoft - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in - all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -// C5 example: anagrams 2004-08-08, 2004-11-16 - -// Compile with -// csc /r:C5.dll Anagrams.cs - -using System; -using System.IO; // StreamReader, TextReader -using System.Text; // Encoding -using System.Text.RegularExpressions; // Regex -using C5; -using SCG = System.Collections.Generic; - -namespace Anagrams -{ - class MyTest - { - public static void Main(String[] args) - { - Console.OutputEncoding = Encoding.GetEncoding("iso-8859-1"); - SCG.IEnumerable ss; - if (args.Length == 2) - ss = ReadFileWords(args[0], int.Parse(args[1])); - else - ss = args; - // foreach (String s in FirstAnagramOnly(ss)) - // Console.WriteLine(s); - // Console.WriteLine("==="); - Timer t = new Timer(); - SCG.IEnumerable> classes = AnagramClasses(ss); - int count = 0; - foreach (SCG.IEnumerable anagramClass in classes) - { - count++; - // foreach (String s in anagramClass) - // Console.Write(s + " "); - // Console.WriteLine(); - } - Console.WriteLine("{0} non-trivial anagram classes", count); - Console.WriteLine(t.Check()); - } - - // Read words at most n words from a file - - public static SCG.IEnumerable ReadFileWords(String filename, int n) - { - Regex delim = new Regex("[^a-zæøåA-ZÆØÅ0-9-]+"); - Encoding enc = Encoding.GetEncoding("iso-8859-1"); - using (TextReader rd = new StreamReader(filename, enc)) - { - for (String line = rd.ReadLine(); line != null; line = rd.ReadLine()) - { - foreach (String s in delim.Split(line)) - if (s != "") - yield return s.ToLower(); - if (--n == 0) - yield break; - } - } - } - - // From an anagram point of view, a word is just a bag of - // characters. So an anagram class is represented as TreeBag - // which permits fast equality comparison -- we shall use them as - // elements of hash sets or keys in hash maps. - - public static TreeBag AnagramClass(String s) - { - TreeBag anagram = new TreeBag(Comparer.Default, EqualityComparer.Default); - foreach (char c in s) - anagram.Add(c); - return anagram; - } - - // Given a sequence of strings, return only the first member of each - // anagram class. - - public static SCG.IEnumerable FirstAnagramOnly(SCG.IEnumerable ss) - { - SCG.IEqualityComparer> tbh - = UnsequencedCollectionEqualityComparer, char>.Default; - HashSet> anagrams = new HashSet>(tbh); - foreach (String s in ss) - { - TreeBag anagram = AnagramClass(s); - if (!anagrams.Contains(anagram)) - { - anagrams.Add(anagram); - yield return s; - } - } - } - - // Given a sequence of strings, return all non-trivial anagram - // classes. Should use a *sequenced* equalityComparer on a TreeBag, - // obviously: after all, characters can be sorted by ASCII code. On - // 347 000 distinct Danish words this takes 70 cpu seconds, 180 MB - // memory, and 263 wall-clock seconds (due to swapping). - - // Using a TreeBag and a sequenced equalityComparer takes 82 cpu seconds - // and 180 MB RAM to find the 26,058 anagram classes among 347,000 - // distinct words. - - // Using an unsequenced equalityComparer on TreeBag or HashBag - // makes it criminally slow: at least 1200 cpu seconds. This must - // be because many bags get the same hash code, so that there are - // many collisions. But exactly how the unsequenced equalityComparer works is - // not clear ... or is it because unsequenced equality is slow? - - public static SCG.IEnumerable> AnagramClasses(SCG.IEnumerable ss) - { - bool unseq = true; - IDictionary, TreeSet> classes; - if (unseq) - { - SCG.IEqualityComparer> unsequencedTreeBagEqualityComparer - = UnsequencedCollectionEqualityComparer, char>.Default; - classes = new HashDictionary, TreeSet>(unsequencedTreeBagEqualityComparer); - } - else - { - SCG.IEqualityComparer> sequencedTreeBagEqualityComparer - = SequencedCollectionEqualityComparer, char>.Default; - classes = new HashDictionary, TreeSet>(sequencedTreeBagEqualityComparer); - } - foreach (String s in ss) - { - TreeBag anagram = AnagramClass(s); - TreeSet anagramClass; - if (!classes.Find(anagram, out anagramClass)) - classes[anagram] = anagramClass = new TreeSet(); - anagramClass.Add(s); - } - foreach (TreeSet anagramClass in classes.Values) - if (anagramClass.Count > 1) - yield return anagramClass; - } - } - -// Crude timing utility ---------------------------------------- - - public class Timer - { - private DateTime start; - - public Timer() - { - start = DateTime.Now; - } - - public double Check() - { - TimeSpan dur = DateTime.Now - start; - return dur.TotalSeconds; - } - } +/* + Copyright (c) 2003-2006 Niels Kokholm and Peter Sestoft + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + SOFTWARE. +*/ + +// C5 example: anagrams 2004-08-08, 2004-11-16 + +// Compile with +// csc /r:C5.dll Anagrams.cs + +using System; +using System.IO; // StreamReader, TextReader +using System.Text; // Encoding +using System.Text.RegularExpressions; // Regex +using C5; +using SCG = System.Collections.Generic; + +namespace Anagrams +{ + class MyTest + { + public static void Main(String[] args) + { + Console.OutputEncoding = Encoding.GetEncoding("iso-8859-1"); + SCG.IEnumerable ss; + if (args.Length == 2) + ss = ReadFileWords(args[0], int.Parse(args[1])); + else + ss = args; + // foreach (String s in FirstAnagramOnly(ss)) + // Console.WriteLine(s); + // Console.WriteLine("==="); + Timer t = new Timer(); + SCG.IEnumerable> classes = AnagramClasses(ss); + int count = 0; + foreach (SCG.IEnumerable anagramClass in classes) + { + count++; + // foreach (String s in anagramClass) + // Console.Write(s + " "); + // Console.WriteLine(); + } + Console.WriteLine("{0} non-trivial anagram classes", count); + Console.WriteLine(t.Check()); + } + + // Read words at most n words from a file + + public static SCG.IEnumerable ReadFileWords(String filename, int n) + { + Regex delim = new Regex("[^a-zæøåA-ZÆØÅ0-9-]+"); + Encoding enc = Encoding.GetEncoding("iso-8859-1"); + using (TextReader rd = new StreamReader(filename, enc)) + { + for (String line = rd.ReadLine(); line != null; line = rd.ReadLine()) + { + foreach (String s in delim.Split(line)) + if (s != "") + yield return s.ToLower(); + if (--n == 0) + yield break; + } + } + } + + // From an anagram point of view, a word is just a bag of + // characters. So an anagram class is represented as TreeBag + // which permits fast equality comparison -- we shall use them as + // elements of hash sets or keys in hash maps. + + public static TreeBag AnagramClass(String s) + { + TreeBag anagram = new TreeBag(Comparer.Default, EqualityComparer.Default); + foreach (char c in s) + anagram.Add(c); + return anagram; + } + + // Given a sequence of strings, return only the first member of each + // anagram class. + + public static SCG.IEnumerable FirstAnagramOnly(SCG.IEnumerable ss) + { + SCG.IEqualityComparer> tbh + = UnsequencedCollectionEqualityComparer, char>.Default; + HashSet> anagrams = new HashSet>(tbh); + foreach (String s in ss) + { + TreeBag anagram = AnagramClass(s); + if (!anagrams.Contains(anagram)) + { + anagrams.Add(anagram); + yield return s; + } + } + } + + // Given a sequence of strings, return all non-trivial anagram + // classes. Should use a *sequenced* equalityComparer on a TreeBag, + // obviously: after all, characters can be sorted by ASCII code. On + // 347 000 distinct Danish words this takes 70 cpu seconds, 180 MB + // memory, and 263 wall-clock seconds (due to swapping). + + // Using a TreeBag and a sequenced equalityComparer takes 82 cpu seconds + // and 180 MB RAM to find the 26,058 anagram classes among 347,000 + // distinct words. + + // Using an unsequenced equalityComparer on TreeBag or HashBag + // makes it criminally slow: at least 1200 cpu seconds. This must + // be because many bags get the same hash code, so that there are + // many collisions. But exactly how the unsequenced equalityComparer works is + // not clear ... or is it because unsequenced equality is slow? + + public static SCG.IEnumerable> AnagramClasses(SCG.IEnumerable ss) + { + bool unseq = true; + IDictionary, TreeSet> classes; + if (unseq) + { + SCG.IEqualityComparer> unsequencedTreeBagEqualityComparer + = UnsequencedCollectionEqualityComparer, char>.Default; + classes = new HashDictionary, TreeSet>(unsequencedTreeBagEqualityComparer); + } + else + { + SCG.IEqualityComparer> sequencedTreeBagEqualityComparer + = SequencedCollectionEqualityComparer, char>.Default; + classes = new HashDictionary, TreeSet>(sequencedTreeBagEqualityComparer); + } + foreach (String s in ss) + { + TreeBag anagram = AnagramClass(s); + TreeSet anagramClass; + if (!classes.Find(anagram, out anagramClass)) + classes[anagram] = anagramClass = new TreeSet(); + anagramClass.Add(s); + } + foreach (TreeSet anagramClass in classes.Values) + if (anagramClass.Count > 1) + yield return anagramClass; + } + } + +// Crude timing utility ---------------------------------------- + + public class Timer + { + private DateTime start; + + public Timer() + { + start = DateTime.Now; + } + + public double Check() + { + TimeSpan dur = DateTime.Now - start; + return dur.TotalSeconds; + } + } } \ No newline at end of file