mcs/tools/monkeydoc/Lucene.Net/Lucene.Net/Analysis/Standard/StandardAnalyzer.cs

   1 /*
   2  * Licensed to the Apache Software Foundation (ASF) under one or more
   3  * contributor license agreements.  See the NOTICE file distributed with
   4  * this work for additional information regarding copyright ownership.
   5  * The ASF licenses this file to You under the Apache License, Version 2.0
   6  * (the "License"); you may not use this file except in compliance with
   7  * the License.  You may obtain a copy of the License at
   8  *
   9  * http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  */
  17
  18 using System;
  19
  20 using Mono.Lucene.Net.Analysis;
  21 using Version = Mono.Lucene.Net.Util.Version;
  22
  23 namespace Mono.Lucene.Net.Analysis.Standard
  24 {
  25
  /// <summary>
  /// Filters <see cref="StandardTokenizer"/> with <see cref="StandardFilter"/>,
  /// <see cref="LowerCaseFilter"/> and <see cref="StopFilter"/>, using a list of English stop
  /// words.
  /// </summary>
  /// <remarks>
  /// <a name="version"/>
  /// <para>
  /// You must specify the required <see cref="Version"/> compatibility when creating
  /// StandardAnalyzer:
  /// </para>
  /// <list type="bullet">
  /// <item><description>As of 2.9, StopFilter preserves position increments</description></item>
  /// <item><description>As of 2.4, Tokens incorrectly identified as acronyms are corrected (see
  /// <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>)</description></item>
  /// </list>
  /// <para>Version: $Id: StandardAnalyzer.java 829134 2009-10-23 17:18:53Z mikemccand $</para>
  /// </remarks>
  public class StandardAnalyzer : Analyzer
  {
    // Stop words handed to every StopFilter this analyzer builds; assigned once by the constructors.
    private readonly System.Collections.Hashtable stopSet;

    /// <summary>
    /// Specifies whether deprecated acronyms should be replaced with HOST type.
    /// Starts out as the class-wide default and is then adjusted by <c>Init</c> and,
    /// for the legacy constructors, by the explicit constructor argument.
    /// </summary>
    /// <remarks>
    /// Deprecated: this should be removed in the next release (3.0).
    /// See https://issues.apache.org/jira/browse/LUCENE-1068
    /// (Deliberately not marked [Obsolete]: the field is private and is still read by
    /// non-obsolete members, so the attribute would only produce warnings inside this class.)
    /// </remarks>
    private bool replaceInvalidAcronym = defaultReplaceInvalidAcronym;

    // Class-wide default for replaceInvalidAcronym; initialised by the static constructor.
    private static bool defaultReplaceInvalidAcronym;

    // Passed to the three-argument StopFilter constructor when the default behaviour is not used.
    private bool enableStopPositionIncrements;

    // Deprecated switch: when true the two-argument StopFilter constructor is used, leaving the
    // position-increment behaviour to StopFilter's own default.
    private bool useDefaultStopPositionIncrements;

    /// <summary>Returns the class-wide default for replacing mischaracterized acronyms.</summary>
    /// <returns>
    /// true if new instances of StandardTokenizer will replace mischaracterized acronyms.
    /// See https://issues.apache.org/jira/browse/LUCENE-1068
    /// </returns>
    /// <remarks>Deprecated: This will be removed (hardwired to true) in 3.0</remarks>
    [Obsolete("This will be removed (hardwired to true) in 3.0")]
    public static bool GetDefaultReplaceInvalidAcronym()
    {
      return defaultReplaceInvalidAcronym;
    }

    /// <summary>Sets the class-wide default for replacing mischaracterized acronyms.</summary>
    /// <param name="replaceInvalidAcronym">
    /// Set to true to have new instances of StandardTokenizer replace mischaracterized
    /// acronyms by default. Set to false to preserve the previous (before 2.4) buggy
    /// behavior. Alternatively, set the application setting
    /// Mono.Lucene.Net.Analysis.Standard.StandardAnalyzer.replaceInvalidAcronym
    /// to false (it is read once, by the static constructor).
    /// See https://issues.apache.org/jira/browse/LUCENE-1068
    /// </param>
    /// <remarks>Deprecated: This will be removed (hardwired to true) in 3.0</remarks>
    [Obsolete("This will be removed (hardwired to true) in 3.0")]
    public static void SetDefaultReplaceInvalidAcronym(bool replaceInvalidAcronym)
    {
      defaultReplaceInvalidAcronym = replaceInvalidAcronym;
    }

    /// <summary>
    /// An array containing some common English words that are usually not
    /// useful for searching.
    /// </summary>
    /// <remarks>Deprecated: Use <see cref="STOP_WORDS_SET"/> instead</remarks>
    [Obsolete("Use STOP_WORDS_SET instead ")]
    public static readonly System.String[] STOP_WORDS;

    /// <summary>
    /// A set containing some common English words that are usually not useful for
    /// searching. The same instance is handed to every analyzer built with the default
    /// stop words, so it should be treated as unmodifiable.
    /// </summary>
    public static readonly System.Collections.Hashtable STOP_WORDS_SET;

    /// <summary>
    /// Builds an analyzer with the default stop words (<see cref="STOP_WORDS_SET"/>),
    /// matching Lucene 2.4 behavior.
    /// </summary>
    /// <remarks>Deprecated: Use <see cref="StandardAnalyzer(Version)"/> instead.</remarks>
    [Obsolete("Use StandardAnalyzer(Version) instead")]
    public StandardAnalyzer() : this(Version.LUCENE_24, STOP_WORDS_SET)
    {
    }

    /// <summary>
    /// Builds an analyzer with the default stop words (<see cref="STOP_WORDS_SET"/>).
    /// </summary>
    /// <param name="matchVersion">Lucene version to match; see <a href="#version">above</a>.</param>
    public StandardAnalyzer(Version matchVersion) : this(matchVersion, STOP_WORDS_SET)
    {
    }

    /// <summary>Builds an analyzer with the given stop words, matching Lucene 2.4 behavior.</summary>
    /// <param name="stopWords">stop words</param>
    /// <remarks>
    /// Deprecated: Use <see cref="StandardAnalyzer(Version, System.Collections.Hashtable)"/> instead
    /// </remarks>
    [Obsolete("Use StandardAnalyzer(Version, Set) instead")]
    public StandardAnalyzer(System.Collections.Hashtable stopWords) : this(Version.LUCENE_24, stopWords)
    {
    }

    /// <summary>Builds an analyzer with the given stop words.</summary>
    /// <param name="matchVersion">Lucene version to match; see <a href="#version">above</a>.</param>
    /// <param name="stopWords">stop words</param>
    public StandardAnalyzer(Version matchVersion, System.Collections.Hashtable stopWords)
    {
      stopSet = stopWords;
      Init(matchVersion);
    }

    /// <summary>Builds an analyzer with the given stop words, matching Lucene 2.4 behavior.</summary>
    /// <param name="stopWords">stop words; converted with <c>StopFilter.MakeStopSet</c></param>
    /// <remarks>
    /// Deprecated: Use <see cref="StandardAnalyzer(Version, System.Collections.Hashtable)"/> instead
    /// </remarks>
    [Obsolete("Use StandardAnalyzer(Version, Set) instead")]
    public StandardAnalyzer(System.String[] stopWords) : this(Version.LUCENE_24, StopFilter.MakeStopSet(stopWords))
    {
    }

    /// <summary>
    /// Builds an analyzer with the stop words from the given file, matching Lucene 2.4 behavior.
    /// </summary>
    /// <param name="stopwords">File to read stop words from</param>
    /// <seealso cref="WordlistLoader.GetWordSet(System.IO.FileInfo)"/>
    /// <remarks>
    /// Deprecated: Use <see cref="StandardAnalyzer(Version, System.IO.FileInfo)"/> instead
    /// </remarks>
    [Obsolete("Use StandardAnalyzer(Version, File) instead")]
    public StandardAnalyzer(System.IO.FileInfo stopwords) : this(Version.LUCENE_24, stopwords)
    {
    }

    /// <summary>Builds an analyzer with the stop words from the given file.</summary>
    /// <param name="matchVersion">Lucene version to match; see <a href="#version">above</a>.</param>
    /// <param name="stopwords">File to read stop words from</param>
    /// <seealso cref="WordlistLoader.GetWordSet(System.IO.FileInfo)"/>
    public StandardAnalyzer(Version matchVersion, System.IO.FileInfo stopwords)
    {
      stopSet = WordlistLoader.GetWordSet(stopwords);
      Init(matchVersion);
    }

    /// <summary>
    /// Builds an analyzer with the stop words from the given reader, matching Lucene 2.4 behavior.
    /// </summary>
    /// <param name="stopwords">Reader to read stop words from</param>
    /// <seealso cref="WordlistLoader.GetWordSet(System.IO.TextReader)"/>
    /// <remarks>
    /// Deprecated: Use <see cref="StandardAnalyzer(Version, System.IO.TextReader)"/> instead
    /// </remarks>
    [Obsolete("Use StandardAnalyzer(Version, Reader) instead")]
    public StandardAnalyzer(System.IO.TextReader stopwords) : this(Version.LUCENE_24, stopwords)
    {
    }

    /// <summary>Builds an analyzer with the stop words from the given reader.</summary>
    /// <param name="matchVersion">Lucene version to match; see <a href="#version">above</a>.</param>
    /// <param name="stopwords">Reader to read stop words from</param>
    /// <seealso cref="WordlistLoader.GetWordSet(System.IO.TextReader)"/>
    public StandardAnalyzer(Version matchVersion, System.IO.TextReader stopwords)
    {
      stopSet = WordlistLoader.GetWordSet(stopwords);
      Init(matchVersion);
    }

    /// <summary>
    /// Builds a Lucene 2.4 compatible analyzer with the default stop words and an explicit
    /// acronym-replacement setting.
    /// </summary>
    /// <param name="replaceInvalidAcronym">
    /// Set to true if this analyzer should replace mischaracterized acronyms in the StandardTokenizer.
    /// See https://issues.apache.org/jira/browse/LUCENE-1068
    /// </param>
    /// <remarks>Deprecated: Remove in 3.X and make true the only valid value</remarks>
    [Obsolete("Remove in 3.X and make true the only valid value")]
    public StandardAnalyzer(bool replaceInvalidAcronym) : this(Version.LUCENE_24, STOP_WORDS_SET)
    {
      // Runs after Init, so the explicit argument wins over the class-wide default.
      this.replaceInvalidAcronym = replaceInvalidAcronym;
      // Init takes the same branch for versions before 2.9; kept explicit as in the original.
      useDefaultStopPositionIncrements = true;
    }

    /// <summary>
    /// Builds a Lucene 2.4 compatible analyzer with stop words from the given reader and an
    /// explicit acronym-replacement setting.
    /// </summary>
    /// <param name="stopwords">The stopwords to use</param>
    /// <param name="replaceInvalidAcronym">
    /// Set to true if this analyzer should replace mischaracterized acronyms in the StandardTokenizer.
    /// See https://issues.apache.org/jira/browse/LUCENE-1068
    /// </param>
    /// <remarks>Deprecated: Remove in 3.X and make true the only valid value</remarks>
    [Obsolete("Remove in 3.X and make true the only valid value")]
    public StandardAnalyzer(System.IO.TextReader stopwords, bool replaceInvalidAcronym) : this(Version.LUCENE_24, stopwords)
    {
      this.replaceInvalidAcronym = replaceInvalidAcronym;
    }

    /// <summary>
    /// Builds a Lucene 2.4 compatible analyzer with stop words from the given file and an
    /// explicit acronym-replacement setting.
    /// </summary>
    /// <param name="stopwords">The stopwords to use</param>
    /// <param name="replaceInvalidAcronym">
    /// Set to true if this analyzer should replace mischaracterized acronyms in the StandardTokenizer.
    /// See https://issues.apache.org/jira/browse/LUCENE-1068
    /// </param>
    /// <remarks>Deprecated: Remove in 3.X and make true the only valid value</remarks>
    [Obsolete("Remove in 3.X and make true the only valid value")]
    public StandardAnalyzer(System.IO.FileInfo stopwords, bool replaceInvalidAcronym) : this(Version.LUCENE_24, stopwords)
    {
      this.replaceInvalidAcronym = replaceInvalidAcronym;
    }

    /// <summary>
    /// Builds a Lucene 2.4 compatible analyzer with the given stop words and an explicit
    /// acronym-replacement setting.
    /// </summary>
    /// <param name="stopwords">The stopwords to use</param>
    /// <param name="replaceInvalidAcronym">
    /// Set to true if this analyzer should replace mischaracterized acronyms in the StandardTokenizer.
    /// See https://issues.apache.org/jira/browse/LUCENE-1068
    /// </param>
    /// <remarks>Deprecated: Remove in 3.X and make true the only valid value</remarks>
    [Obsolete("Remove in 3.X and make true the only valid value")]
    public StandardAnalyzer(System.String[] stopwords, bool replaceInvalidAcronym) : this(Version.LUCENE_24, StopFilter.MakeStopSet(stopwords))
    {
      this.replaceInvalidAcronym = replaceInvalidAcronym;
    }

    /// <summary>
    /// Builds a Lucene 2.4 compatible analyzer with the given stop word set and an explicit
    /// acronym-replacement setting.
    /// </summary>
    /// <param name="stopwords">The stopwords to use</param>
    /// <param name="replaceInvalidAcronym">
    /// Set to true if this analyzer should replace mischaracterized acronyms in the StandardTokenizer.
    /// See https://issues.apache.org/jira/browse/LUCENE-1068
    /// </param>
    /// <remarks>Deprecated: Remove in 3.X and make true the only valid value</remarks>
    [Obsolete("Remove in 3.X and make true the only valid value")]
    public StandardAnalyzer(System.Collections.Hashtable stopwords, bool replaceInvalidAcronym) : this(Version.LUCENE_24, stopwords)
    {
      this.replaceInvalidAcronym = replaceInvalidAcronym;
    }

    /// <summary>
    /// Applies the version-dependent settings shared by all constructors: how stop-word
    /// position increments are handled and whether invalid acronyms are replaced.
    /// </summary>
    /// <param name="matchVersion">Lucene version whose behavior should be matched.</param>
    private void Init(Version matchVersion)
    {
      // Inherited helper; presumably records whether a subclass overrides TokenStream,
      // which ReusableTokenStream checks through overridesTokenStreamMethod.
      SetOverridesTokenStreamMethod(typeof(StandardAnalyzer));

      if (matchVersion.OnOrAfter(Version.LUCENE_29))
      {
        enableStopPositionIncrements = true;
      }
      else
      {
        useDefaultStopPositionIncrements = true;
      }

      // Before 2.4 acronym replacement is always off; from 2.4 on the class-wide default applies.
      replaceInvalidAcronym = matchVersion.OnOrAfter(Version.LUCENE_24) && defaultReplaceInvalidAcronym;
    }

    /// <summary>
    /// Wraps <paramref name="input"/> in a <see cref="StopFilter"/> over this analyzer's stop
    /// words, using the constructor overload selected by the version settings from <c>Init</c>.
    /// </summary>
    /// <param name="input">The stream to filter.</param>
    /// <returns>The stop-filtered stream.</returns>
    private TokenStream CreateStopFilter(TokenStream input)
    {
      if (useDefaultStopPositionIncrements)
      {
        return new StopFilter(input, stopSet);
      }
      return new StopFilter(enableStopPositionIncrements, input, stopSet);
    }

    /// <summary>
    /// Constructs a <see cref="StandardTokenizer"/> filtered by a <see cref="StandardFilter"/>,
    /// a <see cref="LowerCaseFilter"/> and a <see cref="StopFilter"/>.
    /// </summary>
    /// <param name="fieldName">Name of the field being analyzed (not used by this implementation).</param>
    /// <param name="reader">Source of the text to tokenize.</param>
    /// <returns>A newly built filter chain over <paramref name="reader"/>.</returns>
    public override TokenStream TokenStream(System.String fieldName, System.IO.TextReader reader)
    {
      StandardTokenizer tokenizer = new StandardTokenizer(reader, replaceInvalidAcronym);
      tokenizer.SetMaxTokenLength(maxTokenLength);
      TokenStream filtered = new StandardFilter(tokenizer);
      filtered = new LowerCaseFilter(filtered);
      return CreateStopFilter(filtered);
    }

    // Tokenizer plus the complete filter chain, stored through SetPreviousTokenStream so that
    // ReusableTokenStream can reset and reuse them instead of rebuilding the chain.
    private sealed class SavedStreams
    {
      internal StandardTokenizer tokenStream;
      internal TokenStream filteredTokenStream;
    }

    /// <summary>Default maximum allowed token length</summary>
    public const int DEFAULT_MAX_TOKEN_LENGTH = 255;

    private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;

    /// <summary>
    /// Set maximum allowed token length. If a token is seen that exceeds this length then
    /// it is discarded. This setting only takes effect the next time
    /// <c>TokenStream</c> or <c>ReusableTokenStream</c> is called, because the value is
    /// pushed to the tokenizer inside those methods.
    /// </summary>
    /// <param name="length">The new maximum token length.</param>
    public virtual void SetMaxTokenLength(int length)
    {
      maxTokenLength = length;
    }

    /// <summary>Returns the maximum allowed token length currently configured.</summary>
    /// <seealso cref="SetMaxTokenLength"/>
    public virtual int GetMaxTokenLength()
    {
      return maxTokenLength;
    }

    /// <summary>
    /// Returns a filter chain equivalent to the one built by <c>TokenStream</c>, reusing the
    /// chain previously stored through <c>SetPreviousTokenStream</c> when one is available.
    /// </summary>
    /// <param name="fieldName">Name of the field being analyzed.</param>
    /// <param name="reader">Source of the text to tokenize.</param>
    /// <returns>The (possibly reused) filter chain, reset onto <paramref name="reader"/>.</returns>
    /// <remarks>Deprecated: Use <c>TokenStream</c> instead</remarks>
    [Obsolete("Use TokenStream instead")]
    public override TokenStream ReusableTokenStream(System.String fieldName, System.IO.TextReader reader)
    {
      if (overridesTokenStreamMethod)
      {
        // LUCENE-1678: force fallback to tokenStream() if we
        // have been subclassed and that subclass overrides
        // tokenStream but not reusableTokenStream
        return TokenStream(fieldName, reader);
      }

      SavedStreams streams = (SavedStreams) GetPreviousTokenStream();
      if (streams == null)
      {
        // First use: build the chain once and remember it for later calls.
        streams = new SavedStreams();
        SetPreviousTokenStream(streams);
        streams.tokenStream = new StandardTokenizer(reader);
        streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
        streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
        streams.filteredTokenStream = CreateStopFilter(streams.filteredTokenStream);
      }
      else
      {
        streams.tokenStream.Reset(reader);
      }

      // Re-applied on every call so that setter changes made since the last call take effect.
      streams.tokenStream.SetMaxTokenLength(maxTokenLength);
      streams.tokenStream.SetReplaceInvalidAcronym(replaceInvalidAcronym);

      return streams.filteredTokenStream;
    }

    /// <summary>Reports whether this analyzer replaces mischaracterized acronyms.</summary>
    /// <returns>
    /// true if this Analyzer is replacing mischaracterized acronyms in the StandardTokenizer.
    /// See https://issues.apache.org/jira/browse/LUCENE-1068
    /// </returns>
    /// <remarks>Deprecated: This will be removed (hardwired to true) in 3.0</remarks>
    [Obsolete("This will be removed (hardwired to true) in 3.0")]
    public virtual bool IsReplaceInvalidAcronym()
    {
      return replaceInvalidAcronym;
    }

    /// <summary>Sets whether this analyzer replaces mischaracterized acronyms.</summary>
    /// <param name="replaceInvalidAcronym">
    /// Set to true if this Analyzer is replacing mischaracterized acronyms in the StandardTokenizer.
    /// See https://issues.apache.org/jira/browse/LUCENE-1068
    /// </param>
    /// <remarks>Deprecated: This will be removed (hardwired to true) in 3.0</remarks>
    [Obsolete("This will be removed (hardwired to true) in 3.0")]
    public virtual void SetReplaceInvalidAcronym(bool replaceInvalidAcronym)
    {
      this.replaceInvalidAcronym = replaceInvalidAcronym;
    }

    // Reads the class-wide acronym default from the application settings and publishes the
    // default English stop words taken from StopAnalyzer.
    static StandardAnalyzer()
    {
      // Default to true (fixed the bug), unless the setting is present with another value.
      System.String v = SupportClass.AppSettings.Get("Mono.Lucene.Net.Analysis.Standard.StandardAnalyzer.replaceInvalidAcronym", "true");
      defaultReplaceInvalidAcronym = v == null || v.Equals("true");

      STOP_WORDS = StopAnalyzer.ENGLISH_STOP_WORDS;
      STOP_WORDS_SET = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    }
  }
 435 }