Merge pull request #409 from Alkarex/patch-1
[mono.git] / mcs / tools / monkeydoc / Lucene.Net / Lucene.Net / Analysis / Standard / StandardAnalyzer.cs
1 /* 
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements.  See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License.  You may obtain a copy of the License at
8  * 
9  * http://www.apache.org/licenses/LICENSE-2.0
10  * 
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17
18 using System;
19
20 using Mono.Lucene.Net.Analysis;
21 using Version = Mono.Lucene.Net.Util.Version;
22
namespace Mono.Lucene.Net.Analysis.Standard
{
    /// <summary>
    /// Analyzer that feeds a <see cref="StandardTokenizer"/> through a
    /// <see cref="StandardFilter"/>, a <see cref="LowerCaseFilter"/> and a
    /// <see cref="StopFilter"/>, using a list of English stop words by default.
    /// </summary>
    /// <remarks>
    /// <a name="version"/>
    /// <para>
    /// You must specify the required <see cref="Version"/> compatibility when
    /// creating a StandardAnalyzer:
    /// </para>
    /// <list type="bullet">
    /// <item><description>As of 2.9, StopFilter preserves position increments.</description></item>
    /// <item><description>As of 2.4, tokens incorrectly identified as acronyms are corrected
    /// (see <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>).</description></item>
    /// </list>
    /// </remarks>
    /// <version>  $Id: StandardAnalyzer.java 829134 2009-10-23 17:18:53Z mikemccand $
    /// </version>
    public class StandardAnalyzer : Analyzer
    {
        /// <summary>Stop words handed to every <see cref="StopFilter"/> this analyzer builds.</summary>
        private System.Collections.Hashtable stopSet;

        /// <summary>
        /// Whether tokenizers created by this instance replace mischaracterized
        /// acronyms. Starts out as the process-wide default.
        /// See https://issues.apache.org/jira/browse/LUCENE-1068
        /// </summary>
        /// <remarks>Deprecated: this should be removed in the next release (3.0).</remarks>
        [Obsolete("this should be removed in the next release (3.0).")]
        private bool replaceInvalidAcronym = defaultReplaceInvalidAcronym;

        /// <summary>Process-wide default for <c>replaceInvalidAcronym</c>; assigned in the static constructor.</summary>
        private static bool defaultReplaceInvalidAcronym;

        /// <summary>
        /// Value passed explicitly to <see cref="StopFilter"/> when
        /// <c>useDefaultStopPositionIncrements</c> is false.
        /// </summary>
        private bool enableStopPositionIncrements;

        /// <summary>
        /// When true, the <see cref="StopFilter"/> is built through the overload
        /// that does not take a position-increment flag.
        /// </summary>
        /// <remarks>Deprecated.</remarks>
        [Obsolete]
        private bool useDefaultStopPositionIncrements;

        /// <summary>Returns the process-wide default for acronym replacement.</summary>
        /// <returns>
        /// true if new instances of StandardTokenizer will replace
        /// mischaracterized acronyms.
        /// See https://issues.apache.org/jira/browse/LUCENE-1068
        /// </returns>
        /// <remarks>Deprecated: this will be removed (hardwired to true) in 3.0.</remarks>
        [Obsolete("This will be removed (hardwired to true) in 3.0")]
        public static bool GetDefaultReplaceInvalidAcronym()
        {
            return defaultReplaceInvalidAcronym;
        }

        /// <summary>Sets the process-wide default for acronym replacement.</summary>
        /// <param name="replaceInvalidAcronym">
        /// Set to true to have new instances of StandardTokenizer replace
        /// mischaracterized acronyms by default. Set to false to preserve the
        /// previous (before 2.4) buggy behavior. Alternatively, set the application
        /// setting
        /// <c>Mono.Lucene.Net.Analysis.Standard.StandardAnalyzer.replaceInvalidAcronym</c>
        /// to false.
        /// See https://issues.apache.org/jira/browse/LUCENE-1068
        /// </param>
        /// <remarks>Deprecated: this will be removed (hardwired to true) in 3.0.</remarks>
        [Obsolete("This will be removed (hardwired to true) in 3.0")]
        public static void SetDefaultReplaceInvalidAcronym(bool replaceInvalidAcronym)
        {
            defaultReplaceInvalidAcronym = replaceInvalidAcronym;
        }

        /// <summary>
        /// An array containing some common English words that are usually not
        /// useful for searching.
        /// </summary>
        /// <remarks>Deprecated: use <see cref="STOP_WORDS_SET"/> instead.</remarks>
        [Obsolete("Use STOP_WORDS_SET instead ")]
        public static readonly string[] STOP_WORDS;

        /// <summary>
        /// A set containing some common English words that are usually not useful
        /// for searching.
        /// </summary>
        /// <remarks>
        /// NOTE(review): the upstream documentation calls this set unmodifiable;
        /// nothing in this file enforces that - confirm against StopAnalyzer.
        /// </remarks>
        public static readonly System.Collections.Hashtable STOP_WORDS_SET;

        /// <summary>
        /// Builds an analyzer with the default stop words
        /// (<see cref="STOP_WORDS_SET"/>), matching Lucene 2.4 behavior.
        /// </summary>
        /// <remarks>Deprecated: use <c>StandardAnalyzer(Version)</c> instead.</remarks>
        [Obsolete("Use StandardAnalyzer(Version) instead")]
        public StandardAnalyzer() : this(Version.LUCENE_24, STOP_WORDS_SET)
        {
        }

        /// <summary>
        /// Builds an analyzer with the default stop words
        /// (<see cref="STOP_WORDS_SET"/>).
        /// </summary>
        /// <param name="matchVersion">Lucene version to match; see the version notes on this class.</param>
        public StandardAnalyzer(Version matchVersion) : this(matchVersion, STOP_WORDS_SET)
        {
        }

        /// <summary>Builds an analyzer with the given stop words, matching Lucene 2.4 behavior.</summary>
        /// <param name="stopWords">Stop words.</param>
        /// <remarks>Deprecated: use <c>StandardAnalyzer(Version, Hashtable)</c> instead.</remarks>
        [Obsolete("Use StandardAnalyzer(Version, Set) instead")]
        public StandardAnalyzer(System.Collections.Hashtable stopWords) : this(Version.LUCENE_24, stopWords)
        {
        }

        /// <summary>Builds an analyzer with the given stop words.</summary>
        /// <param name="matchVersion">Lucene version to match; see the version notes on this class.</param>
        /// <param name="stopWords">Stop words.</param>
        public StandardAnalyzer(Version matchVersion, System.Collections.Hashtable stopWords)
        {
            stopSet = stopWords;
            Init(matchVersion);
        }

        /// <summary>
        /// Builds an analyzer with the given stop words, matching Lucene 2.4
        /// behavior. The array is converted with <c>StopFilter.MakeStopSet</c>.
        /// </summary>
        /// <param name="stopWords">Stop words.</param>
        /// <remarks>Deprecated: use <c>StandardAnalyzer(Version, Hashtable)</c> instead.</remarks>
        [Obsolete("Use StandardAnalyzer(Version, Set) instead")]
        public StandardAnalyzer(string[] stopWords) : this(Version.LUCENE_24, StopFilter.MakeStopSet(stopWords))
        {
        }

        /// <summary>
        /// Builds an analyzer with the stop words from the given file, matching
        /// Lucene 2.4 behavior.
        /// </summary>
        /// <param name="stopwords">File to read stop words from.</param>
        /// <seealso cref="WordlistLoader.GetWordSet(System.IO.FileInfo)"/>
        /// <remarks>Deprecated: use <c>StandardAnalyzer(Version, FileInfo)</c> instead.</remarks>
        [Obsolete("Use StandardAnalyzer(Version, File) instead")]
        public StandardAnalyzer(System.IO.FileInfo stopwords) : this(Version.LUCENE_24, stopwords)
        {
        }

        /// <summary>Builds an analyzer with the stop words from the given file.</summary>
        /// <param name="matchVersion">Lucene version to match; see the version notes on this class.</param>
        /// <param name="stopwords">File to read stop words from.</param>
        /// <seealso cref="WordlistLoader.GetWordSet(System.IO.FileInfo)"/>
        public StandardAnalyzer(Version matchVersion, System.IO.FileInfo stopwords)
        {
            stopSet = WordlistLoader.GetWordSet(stopwords);
            Init(matchVersion);
        }

        /// <summary>
        /// Builds an analyzer with the stop words from the given reader, matching
        /// Lucene 2.4 behavior.
        /// </summary>
        /// <param name="stopwords">Reader to read stop words from.</param>
        /// <seealso cref="WordlistLoader.GetWordSet(System.IO.TextReader)"/>
        /// <remarks>Deprecated: use <c>StandardAnalyzer(Version, TextReader)</c> instead.</remarks>
        [Obsolete("Use StandardAnalyzer(Version, Reader) instead")]
        public StandardAnalyzer(System.IO.TextReader stopwords) : this(Version.LUCENE_24, stopwords)
        {
        }

        /// <summary>Builds an analyzer with the stop words from the given reader.</summary>
        /// <param name="matchVersion">Lucene version to match; see the version notes on this class.</param>
        /// <param name="stopwords">Reader to read stop words from.</param>
        /// <seealso cref="WordlistLoader.GetWordSet(System.IO.TextReader)"/>
        public StandardAnalyzer(Version matchVersion, System.IO.TextReader stopwords)
        {
            stopSet = WordlistLoader.GetWordSet(stopwords);
            Init(matchVersion);
        }

        /// <summary>
        /// Builds a Lucene 2.4 compatible analyzer with the default stop words and
        /// an explicit acronym-replacement setting.
        /// </summary>
        /// <param name="replaceInvalidAcronym">
        /// Set to true if this analyzer should replace mischaracterized acronyms
        /// in the StandardTokenizer.
        /// See https://issues.apache.org/jira/browse/LUCENE-1068
        /// </param>
        /// <remarks>Deprecated: remove in 3.X and make true the only valid value.</remarks>
        [Obsolete("Remove in 3.X and make true the only valid value")]
        public StandardAnalyzer(bool replaceInvalidAcronym) : this(Version.LUCENE_24, STOP_WORDS_SET)
        {
            // The chained constructor has already run Init, which assigned the
            // process-wide default; the explicit argument overrides it here.
            this.replaceInvalidAcronym = replaceInvalidAcronym;
            useDefaultStopPositionIncrements = true;
        }

        /// <summary>
        /// Builds a Lucene 2.4 compatible analyzer with stop words read from the
        /// given reader and an explicit acronym-replacement setting.
        /// </summary>
        /// <param name="stopwords">The stopwords to use.</param>
        /// <param name="replaceInvalidAcronym">
        /// Set to true if this analyzer should replace mischaracterized acronyms
        /// in the StandardTokenizer.
        /// See https://issues.apache.org/jira/browse/LUCENE-1068
        /// </param>
        /// <remarks>Deprecated: remove in 3.X and make true the only valid value.</remarks>
        [Obsolete("Remove in 3.X and make true the only valid value")]
        public StandardAnalyzer(System.IO.TextReader stopwords, bool replaceInvalidAcronym) : this(Version.LUCENE_24, stopwords)
        {
            this.replaceInvalidAcronym = replaceInvalidAcronym;
        }

        /// <summary>
        /// Builds a Lucene 2.4 compatible analyzer with stop words read from the
        /// given file and an explicit acronym-replacement setting.
        /// </summary>
        /// <param name="stopwords">The stopwords to use.</param>
        /// <param name="replaceInvalidAcronym">
        /// Set to true if this analyzer should replace mischaracterized acronyms
        /// in the StandardTokenizer.
        /// See https://issues.apache.org/jira/browse/LUCENE-1068
        /// </param>
        /// <remarks>Deprecated: remove in 3.X and make true the only valid value.</remarks>
        [Obsolete("Remove in 3.X and make true the only valid value")]
        public StandardAnalyzer(System.IO.FileInfo stopwords, bool replaceInvalidAcronym) : this(Version.LUCENE_24, stopwords)
        {
            this.replaceInvalidAcronym = replaceInvalidAcronym;
        }

        /// <summary>
        /// Builds a Lucene 2.4 compatible analyzer with the given stop word array
        /// and an explicit acronym-replacement setting.
        /// </summary>
        /// <param name="stopwords">The stopwords to use.</param>
        /// <param name="replaceInvalidAcronym">
        /// Set to true if this analyzer should replace mischaracterized acronyms
        /// in the StandardTokenizer.
        /// See https://issues.apache.org/jira/browse/LUCENE-1068
        /// </param>
        /// <remarks>Deprecated: remove in 3.X and make true the only valid value.</remarks>
        [Obsolete("Remove in 3.X and make true the only valid value")]
        public StandardAnalyzer(string[] stopwords, bool replaceInvalidAcronym) : this(Version.LUCENE_24, StopFilter.MakeStopSet(stopwords))
        {
            this.replaceInvalidAcronym = replaceInvalidAcronym;
        }

        /// <summary>
        /// Builds a Lucene 2.4 compatible analyzer with the given stop word set
        /// and an explicit acronym-replacement setting.
        /// </summary>
        /// <param name="stopwords">The stopwords to use.</param>
        /// <param name="replaceInvalidAcronym">
        /// Set to true if this analyzer should replace mischaracterized acronyms
        /// in the StandardTokenizer.
        /// See https://issues.apache.org/jira/browse/LUCENE-1068
        /// </param>
        /// <remarks>Deprecated: remove in 3.X and make true the only valid value.</remarks>
        [Obsolete("Remove in 3.X and make true the only valid value")]
        public StandardAnalyzer(System.Collections.Hashtable stopwords, bool replaceInvalidAcronym) : this(Version.LUCENE_24, stopwords)
        {
            this.replaceInvalidAcronym = replaceInvalidAcronym;
        }

        /// <summary>
        /// Shared constructor logic: registers this type with the base class and
        /// derives the version-dependent flags from <paramref name="matchVersion"/>.
        /// </summary>
        /// <param name="matchVersion">Lucene version whose behavior should be matched.</param>
        private void Init(Version matchVersion)
        {
            SetOverridesTokenStreamMethod(typeof(StandardAnalyzer));

            // 2.9 and later: pass the position-increment flag (true) explicitly.
            // Earlier: use the StopFilter overload that takes no flag.
            if (matchVersion.OnOrAfter(Version.LUCENE_29))
            {
                enableStopPositionIncrements = true;
            }
            else
            {
                useDefaultStopPositionIncrements = true;
            }

            // 2.4 and later follow the process-wide default; older versions never
            // replace invalid acronyms.
            if (matchVersion.OnOrAfter(Version.LUCENE_24))
            {
                replaceInvalidAcronym = defaultReplaceInvalidAcronym;
            }
            else
            {
                replaceInvalidAcronym = false;
            }
        }

        /// <summary>
        /// Constructs a <see cref="StandardTokenizer"/> filtered by a
        /// <see cref="StandardFilter"/>, a <see cref="LowerCaseFilter"/> and a
        /// <see cref="StopFilter"/>.
        /// </summary>
        /// <param name="fieldName">Name of the field being analyzed; not used by this implementation.</param>
        /// <param name="reader">Source of the text to tokenize.</param>
        /// <returns>The outermost filter of the newly built chain.</returns>
        public override TokenStream TokenStream(string fieldName, System.IO.TextReader reader)
        {
            StandardTokenizer tokenizer = new StandardTokenizer(reader, replaceInvalidAcronym);
            tokenizer.SetMaxTokenLength(maxTokenLength);

            TokenStream result = new StandardFilter(tokenizer);
            result = new LowerCaseFilter(result);
            if (useDefaultStopPositionIncrements)
            {
                result = new StopFilter(result, stopSet);
            }
            else
            {
                result = new StopFilter(enableStopPositionIncrements, result, stopSet);
            }
            return result;
        }

        /// <summary>
        /// Holder for the tokenizer and the filter chain built on top of it, saved
        /// between <c>ReusableTokenStream</c> calls.
        /// </summary>
        private sealed class SavedStreams
        {
            internal StandardTokenizer tokenStream;
            internal TokenStream filteredTokenStream;
        }

        /// <summary>Default maximum allowed token length.</summary>
        public const int DEFAULT_MAX_TOKEN_LENGTH = 255;

        /// <summary>Current maximum token length; pushed to the tokenizer on every stream request.</summary>
        private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;

        /// <summary>
        /// Sets the maximum allowed token length. The value is handed to the
        /// tokenizer on every <c>TokenStream</c> or <c>ReusableTokenStream</c>
        /// call, including calls that reuse a saved tokenizer. According to the
        /// upstream documentation, a token exceeding this length is discarded by
        /// the tokenizer.
        /// </summary>
        /// <param name="length">New maximum token length. No validation is performed here.</param>
        public virtual void SetMaxTokenLength(int length)
        {
            maxTokenLength = length;
        }

        /// <summary>Returns the maximum token length currently configured.</summary>
        /// <returns>The value last passed to <see cref="SetMaxTokenLength"/>, or the default.</returns>
        /// <seealso cref="SetMaxTokenLength"/>
        public virtual int GetMaxTokenLength()
        {
            return maxTokenLength;
        }

        /// <summary>
        /// Returns a token stream for <paramref name="reader"/>, reusing the chain
        /// saved by a previous call when one exists.
        /// </summary>
        /// <param name="fieldName">Name of the field being analyzed; only forwarded on the fallback path.</param>
        /// <param name="reader">Source of the text to tokenize.</param>
        /// <returns>
        /// The saved (or newly created and saved) filter chain, or a fresh chain
        /// from <c>TokenStream</c> when the base class reports that a subclass
        /// overrides it.
        /// </returns>
        /// <remarks>Deprecated: use <c>TokenStream</c> instead.</remarks>
        [Obsolete("Use TokenStream instead")]
        public override TokenStream ReusableTokenStream(string fieldName, System.IO.TextReader reader)
        {
            if (overridesTokenStreamMethod)
            {
                // LUCENE-1678: force fallback to TokenStream() if we have been
                // subclassed and that subclass overrides TokenStream but not
                // ReusableTokenStream.
                return TokenStream(fieldName, reader);
            }

            SavedStreams streams = (SavedStreams) GetPreviousTokenStream();
            if (streams == null)
            {
                // First use: build the chain once and save it for later calls.
                // The choice of StopFilter overload is fixed at this point.
                streams = new SavedStreams();
                SetPreviousTokenStream(streams);
                streams.tokenStream = new StandardTokenizer(reader);
                streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
                streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
                if (useDefaultStopPositionIncrements)
                {
                    streams.filteredTokenStream = new StopFilter(streams.filteredTokenStream, stopSet);
                }
                else
                {
                    streams.filteredTokenStream = new StopFilter(enableStopPositionIncrements, streams.filteredTokenStream, stopSet);
                }
            }
            else
            {
                // Reuse: point the saved tokenizer at the new input.
                streams.tokenStream.Reset(reader);
            }

            // Re-apply the mutable settings on every call so that changes made
            // after the chain was created still take effect.
            streams.tokenStream.SetMaxTokenLength(maxTokenLength);
            streams.tokenStream.SetReplaceInvalidAcronym(replaceInvalidAcronym);

            return streams.filteredTokenStream;
        }

        /// <summary>Reports this instance's acronym-replacement setting.</summary>
        /// <returns>
        /// true if this Analyzer is replacing mischaracterized acronyms in the
        /// StandardTokenizer.
        /// See https://issues.apache.org/jira/browse/LUCENE-1068
        /// </returns>
        /// <remarks>Deprecated: this will be removed (hardwired to true) in 3.0.</remarks>
        [Obsolete("This will be removed (hardwired to true) in 3.0")]
        public virtual bool IsReplaceInvalidAcronym()
        {
            return replaceInvalidAcronym;
        }

        /// <summary>Changes this instance's acronym-replacement setting.</summary>
        /// <param name="replaceInvalidAcronym">
        /// Set to true if this Analyzer is replacing mischaracterized acronyms in
        /// the StandardTokenizer.
        /// See https://issues.apache.org/jira/browse/LUCENE-1068
        /// </param>
        /// <remarks>Deprecated: this will be removed (hardwired to true) in 3.0.</remarks>
        [Obsolete("This will be removed (hardwired to true) in 3.0")]
        public virtual void SetReplaceInvalidAcronym(bool replaceInvalidAcronym)
        {
            this.replaceInvalidAcronym = replaceInvalidAcronym;
        }

        /// <summary>
        /// Initializes the process-wide acronym default from application settings
        /// and binds the stop word collections to those of StopAnalyzer.
        /// </summary>
        static StandardAnalyzer()
        {
            // Default to true (the bug fix) unless the setting says otherwise.
            // "true" is passed as the second argument - presumably the fallback
            // returned when the key is absent; verify against SupportClass.
            string v = SupportClass.AppSettings.Get("Mono.Lucene.Net.Analysis.Standard.StandardAnalyzer.replaceInvalidAcronym", "true");

            // Compare case-insensitively: .NET renders booleans as "True", so a
            // case-sensitive match against "true" would silently turn the fix off
            // for values such as "True" or "TRUE".
            defaultReplaceInvalidAcronym = v == null || string.Equals(v, "true", StringComparison.OrdinalIgnoreCase);

            STOP_WORDS = StopAnalyzer.ENGLISH_STOP_WORDS;
            STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
        }
    }
}