2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
20 using Mono.Lucene.Net.Analysis;
21 using Version = Mono.Lucene.Net.Util.Version;
23 namespace Mono.Lucene.Net.Analysis.Standard
26 /// <summary> Filters {@link StandardTokenizer} with {@link StandardFilter},
27 /// {@link LowerCaseFilter} and {@link StopFilter}, using a list of English stop
30 /// <a name="version"/>
32 /// You must specify the required {@link Version} compatibility when creating
35 /// <li>As of 2.9, StopFilter preserves position increments</li>
36 /// <li>As of 2.4, Tokens incorrectly identified as acronyms are corrected (see
37 /// <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)</li>
41 /// <version> $Id: StandardAnalyzer.java 829134 2009-10-23 17:18:53Z mikemccand $
43 public class StandardAnalyzer : Analyzer
/// <summary>Stop words handed to every <c>StopFilter</c> this analyzer creates.</summary>
private System.Collections.Hashtable stopSet;

/// <summary> Specifies whether deprecated acronyms should be replaced with HOST type.
/// The field starts out as <c>defaultReplaceInvalidAcronym</c>, which the static
/// constructor sets to true unless the application setting says otherwise;
/// <c>Init</c> then forces it to false for versions before 2.4.
/// See https://issues.apache.org/jira/browse/LUCENE-1068
/// </summary>
[Obsolete("this should be removed in the next release (3.0).")]
private bool replaceInvalidAcronym = defaultReplaceInvalidAcronym;

/// <summary>Default applied to <c>replaceInvalidAcronym</c> on new analyzers.</summary>
private static bool defaultReplaceInvalidAcronym;

/// <summary>Flag passed to the three-argument <c>StopFilter</c> constructor; only
/// consulted when <c>useDefaultStopPositionIncrements</c> is false.</summary>
private bool enableStopPositionIncrements;

/// <summary>When true, the two-argument <c>StopFilter</c> constructor is used, leaving
/// the position-increment choice to <c>StopFilter</c> itself.</summary>
private bool useDefaultStopPositionIncrements;
/// <summary>Reads the default acronym-replacement setting shared by new analyzers.</summary>
/// <returns> true if new instances of StandardTokenizer will
/// replace mischaracterized acronyms.
/// See https://issues.apache.org/jira/browse/LUCENE-1068
/// </returns>
[Obsolete("This will be removed (hardwired to true) in 3.0")]
public static bool GetDefaultReplaceInvalidAcronym()
{
    return defaultReplaceInvalidAcronym;
}
/// <summary>Changes the default acronym-replacement setting used by analyzers
/// created afterwards.</summary>
/// <param name="replaceInvalidAcronym">Set to true to have new
/// instances of StandardTokenizer replace mischaracterized
/// acronyms by default. Set to false to preserve the
/// previous (before 2.4) buggy behavior. Alternatively,
/// set the application setting
/// Mono.Lucene.Net.Analysis.Standard.StandardAnalyzer.replaceInvalidAcronym,
/// which the static constructor reads.
/// See https://issues.apache.org/jira/browse/LUCENE-1068
/// </param>
[Obsolete("This will be removed (hardwired to true) in 3.0")]
public static void SetDefaultReplaceInvalidAcronym(bool replaceInvalidAcronym)
{
    defaultReplaceInvalidAcronym = replaceInvalidAcronym;
}
/// <summary>An array containing some common English words that are usually not
/// useful for searching. Assigned from <c>StopAnalyzer.ENGLISH_STOP_WORDS</c>
/// in the static constructor.
/// </summary>
[Obsolete("Use STOP_WORDS_SET instead ")]
public static readonly System.String[] STOP_WORDS;

/// <summary>A set containing some common English words that are usually not
/// useful for searching. Assigned from <c>StopAnalyzer.ENGLISH_STOP_WORDS_SET</c>
/// in the static constructor.
/// NOTE(review): described upstream as unmodifiable; a plain Hashtable reference
/// does not enforce that here — confirm before relying on it.
/// </summary>
public static readonly System.Collections.Hashtable STOP_WORDS_SET;
/// <summary>Builds an analyzer with the default stop words
/// (<c>STOP_WORDS_SET</c>), matching <c>Version.LUCENE_24</c>.
/// </summary>
[Obsolete("Use StandardAnalyzer(Version) instead")]
public StandardAnalyzer() : this(Version.LUCENE_24, STOP_WORDS_SET)
{
}
/// <summary>Builds an analyzer with the default stop words
/// (<c>STOP_WORDS_SET</c>).
/// </summary>
/// <param name="matchVersion">Lucene version to match; see the version notes
/// in the class summary.</param>
public StandardAnalyzer(Version matchVersion) : this(matchVersion, STOP_WORDS_SET)
{
}
/// <summary>Builds an analyzer with the given stop words, matching
/// <c>Version.LUCENE_24</c>.</summary>
/// <param name="stopWords">Stop-word set to filter out.</param>
[Obsolete("Use StandardAnalyzer(Version, Set) instead")]
public StandardAnalyzer(System.Collections.Hashtable stopWords) : this(Version.LUCENE_24, stopWords)
{
}
/// <summary>Builds an analyzer with the given stop words.</summary>
/// <param name="matchVersion">Lucene version to match; see the version notes
/// in the class summary.</param>
/// <param name="stopWords">Stop-word set to filter out.</param>
public StandardAnalyzer(Version matchVersion, System.Collections.Hashtable stopWords)
{
    // NOTE(review): this body was absent from the listing. It is restored to mirror
    // the FileInfo/TextReader overloads (which assign stopSet) and to call Init,
    // which nothing else invokes — confirm against the upstream source.
    stopSet = stopWords;
    Init(matchVersion);
}
/// <summary>Builds an analyzer with the given stop words, matching
/// <c>Version.LUCENE_24</c>. The array is converted with
/// <c>StopFilter.MakeStopSet</c>.</summary>
/// <param name="stopWords">Stop words to filter out.</param>
[Obsolete("Use StandardAnalyzer(Version, Set) instead")]
public StandardAnalyzer(System.String[] stopWords) : this(Version.LUCENE_24, StopFilter.MakeStopSet(stopWords))
{
}
/// <summary>Builds an analyzer with the stop words from the given file,
/// matching <c>Version.LUCENE_24</c>.</summary>
/// <param name="stopwords">File to read stop words from.</param>
[Obsolete("Use StandardAnalyzer(Version, File) instead")]
public StandardAnalyzer(System.IO.FileInfo stopwords) : this(Version.LUCENE_24, stopwords)
{
}
/// <summary>Builds an analyzer with the stop words from the given file,
/// loaded through <c>WordlistLoader.GetWordSet</c>.</summary>
/// <param name="matchVersion">Lucene version to match; see the version notes
/// in the class summary.</param>
/// <param name="stopwords">File to read stop words from.</param>
public StandardAnalyzer(Version matchVersion, System.IO.FileInfo stopwords)
{
    stopSet = WordlistLoader.GetWordSet(stopwords);
    // NOTE(review): the Init call was absent from the listing. It is restored so the
    // version-dependent flags are applied — confirm against the upstream source.
    Init(matchVersion);
}
/// <summary>Builds an analyzer with the stop words from the given reader,
/// matching <c>Version.LUCENE_24</c>.</summary>
/// <param name="stopwords">Reader to read stop words from.</param>
[Obsolete("Use StandardAnalyzer(Version, Reader) instead")]
public StandardAnalyzer(System.IO.TextReader stopwords) : this(Version.LUCENE_24, stopwords)
{
}
/// <summary>Builds an analyzer with the stop words from the given reader,
/// loaded through <c>WordlistLoader.GetWordSet</c>.</summary>
/// <param name="matchVersion">Lucene version to match; see the version notes
/// in the class summary.</param>
/// <param name="stopwords">Reader to read stop words from.</param>
public StandardAnalyzer(Version matchVersion, System.IO.TextReader stopwords)
{
    stopSet = WordlistLoader.GetWordSet(stopwords);
    // NOTE(review): the Init call was absent from the listing. It is restored so the
    // version-dependent flags are applied — confirm against the upstream source.
    Init(matchVersion);
}
/// <summary>Builds an analyzer with the default stop words
/// (<c>STOP_WORDS_SET</c>), matching <c>Version.LUCENE_24</c>, and an
/// explicit acronym-replacement setting.</summary>
/// <param name="replaceInvalidAcronym">Set to true if this analyzer should replace
/// mischaracterized acronyms in the StandardTokenizer.
/// See https://issues.apache.org/jira/browse/LUCENE-1068
/// </param>
[Obsolete("Remove in 3.X and make true the only valid value")]
public StandardAnalyzer(bool replaceInvalidAcronym) : this(Version.LUCENE_24, STOP_WORDS_SET)
{
    // Assigned after the chained constructor so the caller's choice wins.
    this.replaceInvalidAcronym = replaceInvalidAcronym;
    useDefaultStopPositionIncrements = true;
}
/// <summary>Builds an analyzer with stop words from the given reader, matching
/// <c>Version.LUCENE_24</c>, and an explicit acronym-replacement setting.</summary>
/// <param name="stopwords">The stopwords to use.</param>
/// <param name="replaceInvalidAcronym">Set to true if this analyzer should replace
/// mischaracterized acronyms in the StandardTokenizer.
/// See https://issues.apache.org/jira/browse/LUCENE-1068
/// </param>
[Obsolete("Remove in 3.X and make true the only valid value")]
public StandardAnalyzer(System.IO.TextReader stopwords, bool replaceInvalidAcronym) : this(Version.LUCENE_24, stopwords)
{
    // Assigned after the chained constructor so the caller's choice wins.
    this.replaceInvalidAcronym = replaceInvalidAcronym;
}
/// <summary>Builds an analyzer with stop words from the given file, matching
/// <c>Version.LUCENE_24</c>, and an explicit acronym-replacement setting.</summary>
/// <param name="stopwords">The stopwords to use.</param>
/// <param name="replaceInvalidAcronym">Set to true if this analyzer should replace
/// mischaracterized acronyms in the StandardTokenizer.
/// See https://issues.apache.org/jira/browse/LUCENE-1068
/// </param>
[Obsolete("Remove in 3.X and make true the only valid value")]
public StandardAnalyzer(System.IO.FileInfo stopwords, bool replaceInvalidAcronym) : this(Version.LUCENE_24, stopwords)
{
    // Assigned after the chained constructor so the caller's choice wins.
    this.replaceInvalidAcronym = replaceInvalidAcronym;
}
/// <summary>Builds an analyzer with the given stop words (converted with
/// <c>StopFilter.MakeStopSet</c>), matching <c>Version.LUCENE_24</c>, and an
/// explicit acronym-replacement setting.</summary>
/// <param name="stopwords">The stopwords to use.</param>
/// <param name="replaceInvalidAcronym">Set to true if this analyzer should replace
/// mischaracterized acronyms in the StandardTokenizer.
/// See https://issues.apache.org/jira/browse/LUCENE-1068
/// </param>
[Obsolete("Remove in 3.X and make true the only valid value")]
public StandardAnalyzer(System.String[] stopwords, bool replaceInvalidAcronym) : this(Version.LUCENE_24, StopFilter.MakeStopSet(stopwords))
{
    // Assigned after the chained constructor so the caller's choice wins.
    this.replaceInvalidAcronym = replaceInvalidAcronym;
}
/// <summary>Builds an analyzer with the given stop-word set, matching
/// <c>Version.LUCENE_24</c>, and an explicit acronym-replacement setting.</summary>
/// <param name="stopwords">The stopwords to use.</param>
/// <param name="replaceInvalidAcronym">Set to true if this analyzer should replace
/// mischaracterized acronyms in the StandardTokenizer.
/// See https://issues.apache.org/jira/browse/LUCENE-1068
/// </param>
[Obsolete("Remove in 3.X and make true the only valid value")]
public StandardAnalyzer(System.Collections.Hashtable stopwords, bool replaceInvalidAcronym) : this(Version.LUCENE_24, stopwords)
{
    // Assigned after the chained constructor so the caller's choice wins.
    this.replaceInvalidAcronym = replaceInvalidAcronym;
}
/// <summary>Applies the version-dependent settings: which <c>StopFilter</c>
/// constructor to use and whether invalid acronyms are replaced.</summary>
/// <param name="matchVersion">Lucene version whose behavior should be matched.</param>
private void Init(Version matchVersion)
{
    SetOverridesTokenStreamMethod(typeof(StandardAnalyzer));

    // From 2.9 on, stop-filter position increments are enabled explicitly;
    // earlier versions defer to StopFilter's own default.
    if (matchVersion.OnOrAfter(Version.LUCENE_29))
    {
        enableStopPositionIncrements = true;
    }
    else
    {
        useDefaultStopPositionIncrements = true;
    }

    // From 2.4 on, the shared default decides acronym replacement;
    // earlier versions never replace.
    if (matchVersion.OnOrAfter(Version.LUCENE_24))
    {
        replaceInvalidAcronym = defaultReplaceInvalidAcronym;
    }
    else
    {
        replaceInvalidAcronym = false;
    }
}
/// <summary>Constructs a <c>StandardTokenizer</c> filtered by a
/// <c>StandardFilter</c>, a <c>LowerCaseFilter</c> and a <c>StopFilter</c>.
/// </summary>
/// <param name="fieldName">Field name; not consulted by this implementation.</param>
/// <param name="reader">Source of the text to tokenize.</param>
/// <returns>The outermost filter of the newly built chain.</returns>
public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
{
    StandardTokenizer tokenStream = new StandardTokenizer(reader, replaceInvalidAcronym);
    tokenStream.SetMaxTokenLength(maxTokenLength);

    TokenStream result = new StandardFilter(tokenStream);
    result = new LowerCaseFilter(result);
    if (useDefaultStopPositionIncrements)
    {
        result = new StopFilter(result, stopSet);
    }
    else
    {
        result = new StopFilter(enableStopPositionIncrements, result, stopSet);
    }

    // NOTE(review): the return statement was absent from the listing; the filter
    // chain is the only value of the declared return type built here.
    return result;
}
/// <summary>Holds the tokenizer and the filter chain built on top of it so that
/// <c>ReusableTokenStream</c> can reuse them across calls.</summary>
private sealed class SavedStreams
{
    /// <summary>Tokenizer at the bottom of the chain; reset with each new reader.</summary>
    internal StandardTokenizer tokenStream;

    /// <summary>Outermost filter of the chain; this is what callers receive.</summary>
    internal TokenStream filteredTokenStream;
}
/// <summary>Default maximum allowed token length </summary>
public const int DEFAULT_MAX_TOKEN_LENGTH = 255;

/// <summary>Maximum token length passed to each <c>StandardTokenizer</c>.</summary>
private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
/// <summary> Set maximum allowed token length. If a token is seen
/// that exceeds this length then it is discarded. This
/// setting only takes effect the next time TokenStream or
/// ReusableTokenStream is called.
/// </summary>
/// <param name="length">New maximum token length.</param>
public virtual void SetMaxTokenLength(int length)
{
    maxTokenLength = length;
}
/// <summary>Returns the maximum token length currently configured.</summary>
/// <returns>The value last passed to <c>SetMaxTokenLength</c>, or
/// <c>DEFAULT_MAX_TOKEN_LENGTH</c> if it was never called.</returns>
/// <seealso cref="SetMaxTokenLength"/>
public virtual int GetMaxTokenLength()
{
    return maxTokenLength;
}
/// <summary>Returns a token stream for <paramref name="reader"/>, reusing the
/// tokenizer and filter chain saved by a previous call when one exists.</summary>
/// <param name="fieldName">Field name; only forwarded to <c>TokenStream</c> on
/// the fallback path.</param>
/// <param name="reader">Source of the text to tokenize.</param>
/// <returns>The outermost filter of the (possibly reused) chain.</returns>
[Obsolete("Use TokenStream instead")]
public override TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader reader)
{
    if (overridesTokenStreamMethod)
    {
        // LUCENE-1678: force fallback to tokenStream() if we
        // have been subclassed and that subclass overrides
        // tokenStream but not reusableTokenStream
        return TokenStream(fieldName, reader);
    }

    SavedStreams streams = (SavedStreams) GetPreviousTokenStream();
    // NOTE(review): the null test and else were absent from the listing; they are
    // restored as the only structure consistent with build-then-reset below.
    if (streams == null)
    {
        // First use: build the chain once and save it for later calls.
        streams = new SavedStreams();
        SetPreviousTokenStream(streams);
        streams.tokenStream = new StandardTokenizer(reader);
        streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
        streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
        if (useDefaultStopPositionIncrements)
        {
            streams.filteredTokenStream = new StopFilter(streams.filteredTokenStream, stopSet);
        }
        else
        {
            streams.filteredTokenStream = new StopFilter(enableStopPositionIncrements, streams.filteredTokenStream, stopSet);
        }
    }
    else
    {
        // Later uses: point the saved tokenizer at the new input.
        streams.tokenStream.Reset(reader);
    }

    // Re-applied on every call so setter changes take effect on reused streams.
    streams.tokenStream.SetMaxTokenLength(maxTokenLength);

    streams.tokenStream.SetReplaceInvalidAcronym(replaceInvalidAcronym);

    return streams.filteredTokenStream;
}
/// <summary>Reads this analyzer's acronym-replacement setting.</summary>
/// <returns> true if this Analyzer is replacing mischaracterized acronyms in the StandardTokenizer.
/// See https://issues.apache.org/jira/browse/LUCENE-1068
/// </returns>
[Obsolete("This will be removed (hardwired to true) in 3.0")]
public virtual bool IsReplaceInvalidAcronym()
{
    return replaceInvalidAcronym;
}
/// <summary>Changes this analyzer's acronym-replacement setting.</summary>
/// <param name="replaceInvalidAcronym">Set to true if this Analyzer is replacing
/// mischaracterized acronyms in the StandardTokenizer.
/// See https://issues.apache.org/jira/browse/LUCENE-1068
/// </param>
[Obsolete("This will be removed (hardwired to true) in 3.0")]
public virtual void SetReplaceInvalidAcronym(bool replaceInvalidAcronym)
{
    this.replaceInvalidAcronym = replaceInvalidAcronym;
}
/// <summary>Sets the default acronym-replacement flag from the application
/// settings and binds the stop-word lists to those of <c>StopAnalyzer</c>.</summary>
static StandardAnalyzer()
{
    // Default to true (fixed the bug), unless the system prop is set
    // NOTE(review): the second argument looks like the fallback returned when the
    // key is missing — confirm against SupportClass.AppSettings.
    System.String v = SupportClass.AppSettings.Get("Mono.Lucene.Net.Analysis.Standard.StandardAnalyzer.replaceInvalidAcronym", "true");
    if (v == null || v.Equals("true"))
    {
        defaultReplaceInvalidAcronym = true;
    }
    else
    {
        // Any value other than the exact string "true" turns the default off.
        defaultReplaceInvalidAcronym = false;
    }

    STOP_WORDS = StopAnalyzer.ENGLISH_STOP_WORDS;
    STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
}