-/*\r
- Copyright (c) 2003-2006 Niels Kokholm and Peter Sestoft\r
- Permission is hereby granted, free of charge, to any person obtaining a copy\r
- of this software and associated documentation files (the "Software"), to deal\r
- in the Software without restriction, including without limitation the rights\r
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell\r
- copies of the Software, and to permit persons to whom the Software is\r
- furnished to do so, subject to the following conditions:\r
- \r
- The above copyright notice and this permission notice shall be included in\r
- all copies or substantial portions of the Software.\r
- \r
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\r
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\r
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE\r
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\r
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,\r
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE\r
- SOFTWARE.\r
-*/\r
-\r
-// C5 example: anagrams 2004-08-08, 2004-11-16\r
-\r
-// Compile with \r
-// csc /r:C5.dll Anagrams.cs \r
-\r
-using System;\r
-using System.IO; // StreamReader, TextReader\r
-using System.Text; // Encoding\r
-using System.Text.RegularExpressions; // Regex\r
-using C5;\r
-using SCG = System.Collections.Generic;\r
-\r
-namespace Anagrams\r
-{\r
- class MyTest\r
- {\r
- public static void Main(String[] args)\r
- {\r
- Console.OutputEncoding = Encoding.GetEncoding("iso-8859-1");\r
- SCG.IEnumerable<String> ss;\r
- if (args.Length == 2)\r
- ss = ReadFileWords(args[0], int.Parse(args[1]));\r
- else\r
- ss = args;\r
- // foreach (String s in FirstAnagramOnly(ss)) \r
- // Console.WriteLine(s);\r
- // Console.WriteLine("===");\r
- Timer t = new Timer();\r
- SCG.IEnumerable<SCG.IEnumerable<String>> classes = AnagramClasses(ss);\r
- int count = 0;\r
- foreach (SCG.IEnumerable<String> anagramClass in classes)\r
- {\r
- count++;\r
- // foreach (String s in anagramClass) \r
- // Console.Write(s + " ");\r
- // Console.WriteLine();\r
- }\r
- Console.WriteLine("{0} non-trivial anagram classes", count);\r
- Console.WriteLine(t.Check());\r
- }\r
-\r
- // Read words at most n words from a file\r
-\r
- public static SCG.IEnumerable<String> ReadFileWords(String filename, int n)\r
- {\r
- Regex delim = new Regex("[^a-zæøåA-ZÆØÅ0-9-]+");\r
- Encoding enc = Encoding.GetEncoding("iso-8859-1");\r
- using (TextReader rd = new StreamReader(filename, enc))\r
- {\r
- for (String line = rd.ReadLine(); line != null; line = rd.ReadLine())\r
- {\r
- foreach (String s in delim.Split(line))\r
- if (s != "")\r
- yield return s.ToLower();\r
- if (--n == 0)\r
- yield break;\r
- }\r
- }\r
- }\r
-\r
- // From an anagram point of view, a word is just a bag of\r
- // characters. So an anagram class is represented as TreeBag<char>\r
- // which permits fast equality comparison -- we shall use them as\r
- // elements of hash sets or keys in hash maps.\r
-\r
- public static TreeBag<char> AnagramClass(String s)\r
- {\r
- TreeBag<char> anagram = new TreeBag<char>(Comparer<char>.Default, EqualityComparer<char>.Default);\r
- foreach (char c in s)\r
- anagram.Add(c);\r
- return anagram;\r
- }\r
-\r
- // Given a sequence of strings, return only the first member of each\r
- // anagram class.\r
-\r
- public static SCG.IEnumerable<String> FirstAnagramOnly(SCG.IEnumerable<String> ss)\r
- {\r
- SCG.IEqualityComparer<TreeBag<char>> tbh\r
- = UnsequencedCollectionEqualityComparer<TreeBag<char>, char>.Default;\r
- HashSet<TreeBag<char>> anagrams = new HashSet<TreeBag<char>>(tbh);\r
- foreach (String s in ss)\r
- {\r
- TreeBag<char> anagram = AnagramClass(s);\r
- if (!anagrams.Contains(anagram))\r
- {\r
- anagrams.Add(anagram);\r
- yield return s;\r
- }\r
- }\r
- }\r
-\r
- // Given a sequence of strings, return all non-trivial anagram\r
- // classes. Should use a *sequenced* equalityComparer on a TreeBag<char>,\r
- // obviously: after all, characters can be sorted by ASCII code. On\r
- // 347 000 distinct Danish words this takes 70 cpu seconds, 180 MB\r
- // memory, and 263 wall-clock seconds (due to swapping).\r
-\r
- // Using a TreeBag<char> and a sequenced equalityComparer takes 82 cpu seconds\r
- // and 180 MB RAM to find the 26,058 anagram classes among 347,000\r
- // distinct words.\r
-\r
- // Using an unsequenced equalityComparer on TreeBag<char> or HashBag<char>\r
- // makes it criminally slow: at least 1200 cpu seconds. This must\r
- // be because many bags get the same hash code, so that there are\r
- // many collisions. But exactly how the unsequenced equalityComparer works is\r
- // not clear ... or is it because unsequenced equality is slow?\r
-\r
- public static SCG.IEnumerable<SCG.IEnumerable<String>> AnagramClasses(SCG.IEnumerable<String> ss)\r
- {\r
- bool unseq = true;\r
- IDictionary<TreeBag<char>, TreeSet<String>> classes;\r
- if (unseq)\r
- {\r
- SCG.IEqualityComparer<TreeBag<char>> unsequencedTreeBagEqualityComparer\r
- = UnsequencedCollectionEqualityComparer<TreeBag<char>, char>.Default;\r
- classes = new HashDictionary<TreeBag<char>, TreeSet<String>>(unsequencedTreeBagEqualityComparer);\r
- }\r
- else\r
- {\r
- SCG.IEqualityComparer<TreeBag<char>> sequencedTreeBagEqualityComparer\r
- = SequencedCollectionEqualityComparer<TreeBag<char>, char>.Default;\r
- classes = new HashDictionary<TreeBag<char>, TreeSet<String>>(sequencedTreeBagEqualityComparer);\r
- }\r
- foreach (String s in ss)\r
- {\r
- TreeBag<char> anagram = AnagramClass(s);\r
- TreeSet<String> anagramClass;\r
- if (!classes.Find(anagram, out anagramClass))\r
- classes[anagram] = anagramClass = new TreeSet<String>();\r
- anagramClass.Add(s);\r
- }\r
- foreach (TreeSet<String> anagramClass in classes.Values)\r
- if (anagramClass.Count > 1)\r
- yield return anagramClass;\r
- }\r
- }\r
-\r
-// Crude timing utility ----------------------------------------\r
-\r
- public class Timer\r
- {\r
- private DateTime start;\r
-\r
- public Timer()\r
- {\r
- start = DateTime.Now;\r
- }\r
-\r
- public double Check()\r
- {\r
- TimeSpan dur = DateTime.Now - start;\r
- return dur.TotalSeconds;\r
- }\r
- }\r
+/*
+ Copyright (c) 2003-2006 Niels Kokholm and Peter Sestoft
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+*/
+
+// C5 example: anagrams 2004-08-08, 2004-11-16
+
+// Compile with
+// csc /r:C5.dll Anagrams.cs
+
+using System;
+using System.IO; // StreamReader, TextReader
+using System.Text; // Encoding
+using System.Text.RegularExpressions; // Regex
+using C5;
+using SCG = System.Collections.Generic;
+
+namespace Anagrams
+{
+ class MyTest
+ {
+ public static void Main(String[] args)
+ {
+ Console.OutputEncoding = Encoding.GetEncoding("iso-8859-1");
+ SCG.IEnumerable<String> ss;
+ if (args.Length == 2)
+ ss = ReadFileWords(args[0], int.Parse(args[1]));
+ else
+ ss = args;
+ // foreach (String s in FirstAnagramOnly(ss))
+ // Console.WriteLine(s);
+ // Console.WriteLine("===");
+ Timer t = new Timer();
+ SCG.IEnumerable<SCG.IEnumerable<String>> classes = AnagramClasses(ss);
+ int count = 0;
+ foreach (SCG.IEnumerable<String> anagramClass in classes)
+ {
+ count++;
+ // foreach (String s in anagramClass)
+ // Console.Write(s + " ");
+ // Console.WriteLine();
+ }
+ Console.WriteLine("{0} non-trivial anagram classes", count);
+ Console.WriteLine(t.Check());
+ }
+
+ // Read words at most n words from a file
+
+ public static SCG.IEnumerable<String> ReadFileWords(String filename, int n)
+ {
+ Regex delim = new Regex("[^a-zæøåA-ZÆØÅ0-9-]+");
+ Encoding enc = Encoding.GetEncoding("iso-8859-1");
+ using (TextReader rd = new StreamReader(filename, enc))
+ {
+ for (String line = rd.ReadLine(); line != null; line = rd.ReadLine())
+ {
+ foreach (String s in delim.Split(line))
+ if (s != "")
+ yield return s.ToLower();
+ if (--n == 0)
+ yield break;
+ }
+ }
+ }
+
+ // From an anagram point of view, a word is just a bag of
+ // characters. So an anagram class is represented as TreeBag<char>
+ // which permits fast equality comparison -- we shall use them as
+ // elements of hash sets or keys in hash maps.
+
+ public static TreeBag<char> AnagramClass(String s)
+ {
+ TreeBag<char> anagram = new TreeBag<char>(Comparer<char>.Default, EqualityComparer<char>.Default);
+ foreach (char c in s)
+ anagram.Add(c);
+ return anagram;
+ }
+
+ // Given a sequence of strings, return only the first member of each
+ // anagram class.
+
+ public static SCG.IEnumerable<String> FirstAnagramOnly(SCG.IEnumerable<String> ss)
+ {
+ SCG.IEqualityComparer<TreeBag<char>> tbh
+ = UnsequencedCollectionEqualityComparer<TreeBag<char>, char>.Default;
+ HashSet<TreeBag<char>> anagrams = new HashSet<TreeBag<char>>(tbh);
+ foreach (String s in ss)
+ {
+ TreeBag<char> anagram = AnagramClass(s);
+ if (!anagrams.Contains(anagram))
+ {
+ anagrams.Add(anagram);
+ yield return s;
+ }
+ }
+ }
+
+ // Given a sequence of strings, return all non-trivial anagram
+ // classes. Should use a *sequenced* equalityComparer on a TreeBag<char>,
+ // obviously: after all, characters can be sorted by ASCII code. On
+ // 347 000 distinct Danish words this takes 70 cpu seconds, 180 MB
+ // memory, and 263 wall-clock seconds (due to swapping).
+
+ // Using a TreeBag<char> and a sequenced equalityComparer takes 82 cpu seconds
+ // and 180 MB RAM to find the 26,058 anagram classes among 347,000
+ // distinct words.
+
+ // Using an unsequenced equalityComparer on TreeBag<char> or HashBag<char>
+ // makes it criminally slow: at least 1200 cpu seconds. This must
+ // be because many bags get the same hash code, so that there are
+ // many collisions. But exactly how the unsequenced equalityComparer works is
+ // not clear ... or is it because unsequenced equality is slow?
+
+ public static SCG.IEnumerable<SCG.IEnumerable<String>> AnagramClasses(SCG.IEnumerable<String> ss)
+ {
+ bool unseq = true;
+ IDictionary<TreeBag<char>, TreeSet<String>> classes;
+ if (unseq)
+ {
+ SCG.IEqualityComparer<TreeBag<char>> unsequencedTreeBagEqualityComparer
+ = UnsequencedCollectionEqualityComparer<TreeBag<char>, char>.Default;
+ classes = new HashDictionary<TreeBag<char>, TreeSet<String>>(unsequencedTreeBagEqualityComparer);
+ }
+ else
+ {
+ SCG.IEqualityComparer<TreeBag<char>> sequencedTreeBagEqualityComparer
+ = SequencedCollectionEqualityComparer<TreeBag<char>, char>.Default;
+ classes = new HashDictionary<TreeBag<char>, TreeSet<String>>(sequencedTreeBagEqualityComparer);
+ }
+ foreach (String s in ss)
+ {
+ TreeBag<char> anagram = AnagramClass(s);
+ TreeSet<String> anagramClass;
+ if (!classes.Find(anagram, out anagramClass))
+ classes[anagram] = anagramClass = new TreeSet<String>();
+ anagramClass.Add(s);
+ }
+ foreach (TreeSet<String> anagramClass in classes.Values)
+ if (anagramClass.Count > 1)
+ yield return anagramClass;
+ }
+ }
+
+// Crude timing utility ----------------------------------------
+
+ public class Timer
+ {
+ private DateTime start;
+
+ public Timer()
+ {
+ start = DateTime.Now;
+ }
+
+ public double Check()
+ {
+ TimeSpan dur = DateTime.Now - start;
+ return dur.TotalSeconds;
+ }
+ }