5 // Copyright (c) 2007-2008 Jiri Moudry, Pascal Craponne
\r
7 // Permission is hereby granted, free of charge, to any person obtaining a copy
\r
8 // of this software and associated documentation files (the "Software"), to deal
\r
9 // in the Software without restriction, including without limitation the rights
\r
10 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
\r
11 // copies of the Software, and to permit persons to whom the Software is
\r
12 // furnished to do so, subject to the following conditions:
\r
14 // The above copyright notice and this permission notice shall be included in
\r
15 // all copies or substantial portions of the Software.
\r
17 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
\r
18 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
\r
19 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
\r
20 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
\r
21 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
\r
22 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
\r
28 using System.Collections.Generic;
\r
29 using System.Globalization;
\r
32 namespace DbLinq.Language.Implementation
\r
35 /// Offers base mechanisms for word-based languages (i.e. all of them)
\r
40 abstract class AbstractWords : ILanguageWords
\r
43 /// Words and corresponding weights
\r
45 protected IDictionary<string, int> WordsWeights;
\r
47 /// Plural forms for singular words (exceptions)
\r
49 protected IDictionary<string, string> SingularToPlural = new Dictionary<string, string>();
\r
51 /// Singular forms for plural words (exceptions)
\r
53 protected IDictionary<string, string> PluralToSingular = new Dictionary<string, string>();
\r
56 /// Using English heuristics, converts 'dogs' to 'dog',
\r
57 /// 'categories' to 'category',
\r
58 /// 'cat' remains unchanged.
\r
60 /// <param name="plural"></param>
\r
61 /// <returns></returns>
\r
62 public virtual string Singularize(string plural)
\r
65 if (PluralToSingular.TryGetValue(plural, out singular))
\r
67 return ComputeSingular(plural);
\r
71 /// Using English heuristics, converts 'dog' to 'dogs',
\r
72 /// 'bass' remains unchanged.
\r
74 /// <param name="singular"></param>
\r
75 /// <returns></returns>
\r
76 public virtual string Pluralize(string singular)
\r
79 if (SingularToPlural.TryGetValue(singular, out plural))
\r
81 return ComputePlural(singular);
\r
85 /// Computes the singular.
\r
87 /// <param name="plural">The plural.</param>
\r
88 /// <returns></returns>
\r
89 protected abstract string ComputeSingular(string plural);
\r
91 /// Computes the plural.
\r
93 /// <param name="singular">The singular.</param>
\r
94 /// <returns></returns>
\r
95 protected abstract string ComputePlural(string singular);
\r
98 /// Returns true if the required culture is supported
\r
100 /// <param name="cultureInfo"></param>
\r
101 /// <returns></returns>
\r
102 public abstract bool Supports(CultureInfo cultureInfo);
\r
104 /// Loads the words (operation may be slow, so it is excluded from ctor)
\r
106 public abstract void Load();
\r
109 /// Loads the specified resource name.
\r
111 /// <param name="resourceName">Name of the resource.</param>
\r
112 public virtual void Load(string resourceName)
\r
114 WordsWeights = new Dictionary<string, int>();
\r
115 var type = GetType();
\r
116 using (var resourceStream = type.Assembly.GetManifestResourceStream(type, resourceName))
\r
118 using (var resourceReader = new StreamReader(resourceStream))
\r
120 var singularPluralSeparator = new[] { "=>" };
\r
121 while (!resourceReader.EndOfStream)
\r
123 string word = resourceReader.ReadLine().Trim().ToLower();
\r
124 // comments start with a "#"
\r
125 if (word.Length == 0 || word[0] == '#')
\r
128 // starting a word with a "+" adds weight to it
\r
129 while (word.StartsWith("+"))
\r
132 word = word.Substring(1);
\r
135 var singularPlural = word.Split(singularPluralSeparator, StringSplitOptions.RemoveEmptyEntries);
\r
136 // "a => b" declares a singular => plural form
\r
137 if (singularPlural.Length > 1)
\r
139 word = singularPlural[0].Trim();
\r
140 var plural = singularPlural[1].Trim();
\r
141 SingularToPlural[word] = plural;
\r
142 PluralToSingular[plural] = word;
\r
145 if (!WordsWeights.ContainsKey(word))
\r
146 WordsWeights[word] = count;
\r
148 WordsWeights[word] += count;
\r
155 /// Gets the standard form for word (removes mixed letters, for example).
\r
156 /// The goal is to make it usable from dictionary.
\r
158 /// <param name="word">The word.</param>
\r
159 /// <returns></returns>
\r
160 protected virtual string GetStandard(string word)
\r
166 /// Gets the weight for a given word.
\r
167 /// Actually based on dictionary info.
\r
169 /// <param name="word">The word.</param>
\r
170 /// <returns></returns>
\r
171 protected int GetWeight(string word)
\r
173 if (word.Length == 1) // a letter is always 1
\r
176 WordsWeights.TryGetValue(GetStandard(word.ToLower()), out weight);
\r
181 /// Tells if the specified word exists in dictionary.
\r
183 /// <param name="word">The word.</param>
\r
184 /// <returns></returns>
\r
185 protected bool Exists(string word)
\r
187 return GetWeight(word) > 0;
\r
191 /// Context is used to speed up word recognition
\r
193 private class Context
\r
195 internal class Split
\r
197 public IList<string> Words;
\r
198 public double Note;
\r
201 public readonly IDictionary<string, Split> Splits = new Dictionary<string, Split>();
\r
205 /// Extracts words from an undifferentiated run of letters (a "magma"),
\r
206 /// for example "shipsperunit" --> "ships" "per" "unit"
\r
208 /// <param name="text">The text.</param>
\r
209 /// <returns></returns>
\r
210 public virtual IList<string> GetWords(string text)
\r
212 //var context = new Context();
\r
213 //IList<string> words = new List<string>();
\r
214 //int lastIndex = 0;
\r
215 //for (int index = 0; index <= text.Length; index++)
\r
217 // if (index == text.Length || !char.IsLetterOrDigit(text[index]))
\r
219 // var word = text.Substring(lastIndex, index - lastIndex);
\r
220 // // if the word is empty, we skip it
\r
221 // if (!string.IsNullOrEmpty(word))
\r
222 // GetMagmaWords(word, words, context);
\r
223 // lastIndex = index + 1;
\r
227 var words = new List<string>();
\r
228 GetMagmaWords(text, words, new Context());
\r
233 /// Gets the magma words.
\r
235 /// <param name="magma">The magma.</param>
\r
236 /// <param name="words">The words.</param>
\r
237 /// <param name="context">The context.</param>
\r
238 private void GetMagmaWords(string magma, ICollection<string> words, Context context)
\r
240 foreach (var word in GetMagmaWords(magma, context))
\r
245 /// Extracts words from a "word magma" by splitting the string at every position and keeping the best score.
\r
246 /// The method is recursive
\r
248 /// <param name="magma">The magma.</param>
\r
249 /// <param name="context">The context.</param>
\r
250 /// <returns></returns>
\r
251 private IList<string> GetMagmaWords(string magma, Context context)
\r
253 var foundWords = new List<string>();
\r
254 if (magma.Length == 0)
\r
255 throw new ArgumentException("magma string must not be empty");
\r
256 // initialize matching
\r
257 IList<string> bestLeft = new[] { magma };
\r
258 IList<string> bestRight = new string[0];
\r
259 double bestNote = GetNote(bestLeft);
\r
260 if (bestNote > 0) // if we have something here, it is a full word, then don't look any further
\r
261 return bestLeft; // that this may break the weight... for example toothpaste always win vs +++tooth +++paste
\r
263 for (int i = 1; i <= magma.Length - 1; i++)
\r
265 var left = magma.Substring(0, i);
\r
266 var right = magma.Substring(i);
\r
267 IList<string> leftWords, rightWords;
\r
268 double leftNote = ComputeWords(left, out leftWords, context);
\r
269 double rightNote = ComputeWords(right, out rightWords, context);
\r
270 double note = leftNote + rightNote;
\r
271 if (note >= bestNote) // >= means "longer words are better"
\r
274 bestLeft = leftWords;
\r
275 bestRight = rightWords;
\r
278 foundWords.AddRange(bestLeft);
\r
279 foundWords.AddRange(bestRight);
\r
284 /// Computes the words.
\r
286 /// <param name="magma">The magma.</param>
\r
287 /// <param name="words">The words.</param>
\r
288 /// <param name="context">The context.</param>
\r
289 /// <returns></returns>
\r
290 private double ComputeWords(string magma, out IList<string> words, Context context)
\r
292 Context.Split split;
\r
293 if (!context.Splits.TryGetValue(magma, out split))
\r
295 split = new Context.Split
\r
297 Words = GetMagmaWords(magma, context)
\r
299 split.Note = GetNote(split.Words);
\r
300 context.Splits[magma] = split;
\r
302 words = split.Words;
\r
307 /// Returns a value for a list of words, with the following rules:
\r
308 /// - fewer is better
\r
309 /// - popular is better
\r
311 /// <param name="words"></param>
\r
312 /// <returns></returns>
\r
313 public double GetNote(IList<string> words)
\r
315 if (words.Count == 0)
\r
318 double totalWeight = 0;
\r
319 foreach (string word in words)
\r
321 double weight = GetWeight(word);
\r
322 totalWeight += weight;
\r
324 double averageWeight = totalWeight / words.Count;
\r
325 return averageWeight / words.Count
\r
326 * 1000; // coz it's easier to read
\r