mcs/tools/monkeydoc/Lucene.Net/Lucene.Net/Analysis/Standard/StandardTokenizer.cs

   1 /*
   2  * Licensed to the Apache Software Foundation (ASF) under one or more
   3  * contributor license agreements.  See the NOTICE file distributed with
   4  * this work for additional information regarding copyright ownership.
   5  * The ASF licenses this file to You under the Apache License, Version 2.0
   6  * (the "License"); you may not use this file except in compliance with
   7  * the License.  You may obtain a copy of the License at
   8  *
   9  * http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  */
  17
  18 using System;
  19
  20 using CharReader = Mono.Lucene.Net.Analysis.CharReader;
  21 using Token = Mono.Lucene.Net.Analysis.Token;
  22 using Tokenizer = Mono.Lucene.Net.Analysis.Tokenizer;
  23 using OffsetAttribute = Mono.Lucene.Net.Analysis.Tokenattributes.OffsetAttribute;
  24 using PositionIncrementAttribute = Mono.Lucene.Net.Analysis.Tokenattributes.PositionIncrementAttribute;
  25 using TermAttribute = Mono.Lucene.Net.Analysis.Tokenattributes.TermAttribute;
  26 using TypeAttribute = Mono.Lucene.Net.Analysis.Tokenattributes.TypeAttribute;
  27 using AttributeSource = Mono.Lucene.Net.Util.AttributeSource;
  28 using Version = Mono.Lucene.Net.Util.Version;
  29
  30 namespace Mono.Lucene.Net.Analysis.Standard
  31 {
  32
  33         /// <summary>A grammar-based tokenizer constructed with JFlex
  34         ///
  35         /// <p/> This should be a good tokenizer for most European-language documents:
  36         ///
  37         /// <ul>
  38         /// <li>Splits words at punctuation characters, removing punctuation. However, a
  39         /// dot that's not followed by whitespace is considered part of a token.</li>
  40         /// <li>Splits words at hyphens, unless there's a number in the token, in which case
  41         /// the whole token is interpreted as a product number and is not split.</li>
  42         /// <li>Recognizes email addresses and internet hostnames as one token.</li>
  43         /// </ul>
  44         ///
  45         /// <p/>Many applications have specific tokenizer needs.  If this tokenizer does
  46         /// not suit your application, please consider copying this source code
  47         /// directory to your project and maintaining your own grammar-based tokenizer.
  48         ///
  49         /// <a name="version"/>
  50         /// <p/>
  51         /// You must specify the required {@link Version} compatibility when creating
  52         /// StandardAnalyzer:
  53         /// <ul>
  54         /// <li>As of 2.4, Tokens incorrectly identified as acronyms are corrected (see
  55         /// <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1608</a></li>
  56         /// </ul>
  57         /// </summary>
  58
  59         public class StandardTokenizer:Tokenizer
  60         {
  61                 private void  InitBlock()
  62                 {
  63                         maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
  64                 }
  65                 /// <summary>A private instance of the JFlex-constructed scanner </summary>
  66                 private StandardTokenizerImpl scanner;
  67
  68                 public const int ALPHANUM = 0;
  69                 public const int APOSTROPHE = 1;
  70                 public const int ACRONYM = 2;
  71                 public const int COMPANY = 3;
  72                 public const int EMAIL = 4;
  73                 public const int HOST = 5;
  74                 public const int NUM = 6;
  75                 public const int CJ = 7;
  76
  77                 /// <deprecated> this solves a bug where HOSTs that end with '.' are identified
  78                 /// as ACRONYMs. It is deprecated and will be removed in the next
  79                 /// release.
  80                 /// </deprecated>
  81         [Obsolete("this solves a bug where HOSTs that end with '.' are identified as ACRONYMs. It is deprecated and will be removed in the next release.")]
  82                 public const int ACRONYM_DEP = 8;
  83
  84                 /// <summary>String token types that correspond to token type int constants </summary>
  85                 public static readonly System.String[] TOKEN_TYPES = new System.String[]{"<ALPHANUM>", "<APOSTROPHE>", "<ACRONYM>", "<COMPANY>", "<EMAIL>", "<HOST>", "<NUM>", "<CJ>", "<ACRONYM_DEP>"};
  86
  87                 /// <deprecated> Please use {@link #TOKEN_TYPES} instead
  88                 /// </deprecated>
  89         [Obsolete("Please use TOKEN_TYPES instead")]
  90                 public static readonly System.String[] tokenImage = TOKEN_TYPES;
  91
  92                 /// <summary> Specifies whether deprecated acronyms should be replaced with HOST type.
  93                 /// This is false by default to support backward compatibility.
  94                 /// <p/>
  95                 /// See http://issues.apache.org/jira/browse/LUCENE-1068
  96                 ///
  97                 /// </summary>
  98                 /// <deprecated> this should be removed in the next release (3.0).
  99                 /// </deprecated>
 100         [Obsolete("this should be removed in the next release (3.0).")]
 101                 private bool replaceInvalidAcronym;
 102
 103                 private int maxTokenLength;
 104
 105                 /// <summary>Set the max allowed token length.  Any token longer
 106                 /// than this is skipped.
 107                 /// </summary>
 108                 public virtual void  SetMaxTokenLength(int length)
 109                 {
 110                         this.maxTokenLength = length;
 111                 }
 112
 113                 /// <seealso cref="setMaxTokenLength">
 114                 /// </seealso>
 115                 public virtual int GetMaxTokenLength()
 116                 {
 117                         return maxTokenLength;
 118                 }
 119
 120                 /// <summary> Creates a new instance of the {@link StandardTokenizer}. Attaches the
 121                 /// <code>input</code> to a newly created JFlex scanner.
 122                 /// </summary>
 123                 /// <deprecated> Use {@link #StandardTokenizer(Version, Reader)} instead
 124                 /// </deprecated>
 125         [Obsolete("Use StandardTokenizer(Version, Reader) instead")]
 126                 public StandardTokenizer(System.IO.TextReader input):this(Version.LUCENE_24, input)
 127                 {
 128                 }
 129
 130                 /// <summary> Creates a new instance of the {@link Mono.Lucene.Net.Analysis.Standard.StandardTokenizer}.  Attaches
 131                 /// the <code>input</code> to the newly created JFlex scanner.
 132                 ///
 133                 /// </summary>
 134                 /// <param name="input">The input reader
 135                 /// </param>
 136                 /// <param name="replaceInvalidAcronym">Set to true to replace mischaracterized acronyms with HOST.
 137                 ///
 138                 /// See http://issues.apache.org/jira/browse/LUCENE-1068
 139                 /// </param>
 140                 /// <deprecated> Use {@link #StandardTokenizer(Version, Reader)} instead
 141                 /// </deprecated>
 142         [Obsolete("Use StandardTokenizer(Version, Reader) instead")]
 143                 public StandardTokenizer(System.IO.TextReader input, bool replaceInvalidAcronym):base()
 144                 {
 145                         InitBlock();
 146                         this.scanner = new StandardTokenizerImpl(input);
 147                         Init(input, replaceInvalidAcronym);
 148                 }
 149
 150                 /// <summary> Creates a new instance of the
 151                 /// {@link org.apache.lucene.analysis.standard.StandardTokenizer}. Attaches
 152                 /// the <code>input</code> to the newly created JFlex scanner.
 153                 ///
 154                 /// </summary>
 155                 /// <param name="input">The input reader
 156                 ///
 157                 /// See http://issues.apache.org/jira/browse/LUCENE-1068
 158                 /// </param>
 159                 public StandardTokenizer(Version matchVersion, System.IO.TextReader input):base()
 160                 {
 161                         InitBlock();
 162                         this.scanner = new StandardTokenizerImpl(input);
 163                         Init(input, matchVersion);
 164                 }
 165
 166                 /// <summary> Creates a new StandardTokenizer with a given {@link AttributeSource}. </summary>
 167                 /// <deprecated> Use
 168                 /// {@link #StandardTokenizer(Version, AttributeSource, Reader)}
 169                 /// instead
 170                 /// </deprecated>
 171         [Obsolete("Use StandardTokenizer(Version, AttributeSource, Reader) instead")]
 172                 public StandardTokenizer(AttributeSource source, System.IO.TextReader input, bool replaceInvalidAcronym):base(source)
 173                 {
 174                         InitBlock();
 175                         this.scanner = new StandardTokenizerImpl(input);
 176                         Init(input, replaceInvalidAcronym);
 177                 }
 178
 179                 /// <summary> Creates a new StandardTokenizer with a given {@link AttributeSource}.</summary>
 180                 public StandardTokenizer(Version matchVersion, AttributeSource source, System.IO.TextReader input):base(source)
 181                 {
 182                         InitBlock();
 183                         this.scanner = new StandardTokenizerImpl(input);
 184                         Init(input, matchVersion);
 185                 }
 186
 187                 /// <summary> Creates a new StandardTokenizer with a given {@link Mono.Lucene.Net.Util.AttributeSource.AttributeFactory} </summary>
 188                 /// <deprecated> Use
 189                 /// {@link #StandardTokenizer(Version, org.apache.lucene.util.AttributeSource.AttributeFactory, Reader)}
 190                 /// instead
 191                 /// </deprecated>
 192         [Obsolete("Use StandardTokenizer(Version, Mono.Lucene.Net.Util.AttributeSource.AttributeFactory, Reader) instead")]
 193                 public StandardTokenizer(AttributeFactory factory, System.IO.TextReader input, bool replaceInvalidAcronym):base(factory)
 194                 {
 195                         InitBlock();
 196                         this.scanner = new StandardTokenizerImpl(input);
 197                         Init(input, replaceInvalidAcronym);
 198                 }
 199
 200                 /// <summary> Creates a new StandardTokenizer with a given
 201                 /// {@link org.apache.lucene.util.AttributeSource.AttributeFactory}
 202                 /// </summary>
 203                 public StandardTokenizer(Version matchVersion, AttributeFactory factory, System.IO.TextReader input):base(factory)
 204                 {
 205                         InitBlock();
 206                         this.scanner = new StandardTokenizerImpl(input);
 207                         Init(input, matchVersion);
 208                 }
 209
 210                 private void  Init(System.IO.TextReader input, bool replaceInvalidAcronym)
 211                 {
 212                         this.replaceInvalidAcronym = replaceInvalidAcronym;
 213                         this.input = input;
 214                         termAtt = (TermAttribute) AddAttribute(typeof(TermAttribute));
 215                         offsetAtt = (OffsetAttribute) AddAttribute(typeof(OffsetAttribute));
 216                         posIncrAtt = (PositionIncrementAttribute) AddAttribute(typeof(PositionIncrementAttribute));
 217                         typeAtt = (TypeAttribute) AddAttribute(typeof(TypeAttribute));
 218                 }
 219
 220                 private void  Init(System.IO.TextReader input, Version matchVersion)
 221                 {
 222                         if (matchVersion.OnOrAfter(Version.LUCENE_24))
 223                         {
 224                                 Init(input, true);
 225                         }
 226                         else
 227                         {
 228                                 Init(input, false);
 229                         }
 230                 }
 231
 232                 // this tokenizer generates three attributes:
 233                 // offset, positionIncrement and type
 234                 private TermAttribute termAtt;
 235                 private OffsetAttribute offsetAtt;
 236                 private PositionIncrementAttribute posIncrAtt;
 237                 private TypeAttribute typeAtt;
 238
 239                 /*
 240                 * (non-Javadoc)
 241                 *
 242                 * @see Mono.Lucene.Net.Analysis.TokenStream#next()
 243                 */
 244                 public override bool IncrementToken()
 245                 {
 246                         ClearAttributes();
 247                         int posIncr = 1;
 248
 249                         while (true)
 250                         {
 251                                 int tokenType = scanner.GetNextToken();
 252
 253                                 if (tokenType == StandardTokenizerImpl.YYEOF)
 254                                 {
 255                                         return false;
 256                                 }
 257
 258                                 if (scanner.Yylength() <= maxTokenLength)
 259                                 {
 260                                         posIncrAtt.SetPositionIncrement(posIncr);
 261                                         scanner.GetText(termAtt);
 262                                         int start = scanner.Yychar();
 263                                         offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + termAtt.TermLength()));
 264                                         // This 'if' should be removed in the next release. For now, it converts
 265                                         // invalid acronyms to HOST. When removed, only the 'else' part should
 266                                         // remain.
 267                                         if (tokenType == StandardTokenizerImpl.ACRONYM_DEP)
 268                                         {
 269                                                 if (replaceInvalidAcronym)
 270                                                 {
 271                                                         typeAtt.SetType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
 272                                                         termAtt.SetTermLength(termAtt.TermLength() - 1); // remove extra '.'
 273                                                 }
 274                                                 else
 275                                                 {
 276                                                         typeAtt.SetType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
 277                                                 }
 278                                         }
 279                                         else
 280                                         {
 281                                                 typeAtt.SetType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
 282                                         }
 283                                         return true;
 284                                 }
 285                                 // When we skip a too-long term, we still increment the
 286                                 // position increment
 287                                 else
 288                                         posIncr++;
 289                         }
 290                 }
 291
 292                 public override void  End()
 293                 {
 294                         // set final offset
 295                         int finalOffset = CorrectOffset(scanner.Yychar() + scanner.Yylength());
 296                         offsetAtt.SetOffset(finalOffset, finalOffset);
 297                 }
 298
 299                 /// <deprecated> Will be removed in Lucene 3.0. This method is final, as it should
 300                 /// not be overridden. Delegates to the backwards compatibility layer.
 301                 /// </deprecated>
 302         [Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer. ")]
 303                 public override Token Next(Token reusableToken)
 304                 {
 305                         return base.Next(reusableToken);
 306                 }
 307
 308                 /// <deprecated> Will be removed in Lucene 3.0. This method is final, as it should
 309                 /// not be overridden. Delegates to the backwards compatibility layer.
 310                 /// </deprecated>
 311         [Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer. ")]
 312                 public override Token Next()
 313                 {
 314                         return base.Next();
 315                 }
 316
 317
 318                 public override void  Reset(System.IO.TextReader reader)
 319                 {
 320                         base.Reset(reader);
 321                         scanner.Reset(reader);
 322                 }
 323
 324                 /// <summary> Prior to https://issues.apache.org/jira/browse/LUCENE-1068, StandardTokenizer mischaracterized as acronyms tokens like www.abc.com
 325                 /// when they should have been labeled as hosts instead.
 326                 /// </summary>
 327                 /// <returns> true if StandardTokenizer now returns these tokens as Hosts, otherwise false
 328                 ///
 329                 /// </returns>
 330                 /// <deprecated> Remove in 3.X and make true the only valid value
 331                 /// </deprecated>
 332         [Obsolete("Remove in 3.X and make true the only valid value")]
 333                 public virtual bool IsReplaceInvalidAcronym()
 334                 {
 335                         return replaceInvalidAcronym;
 336                 }
 337
 338                 /// <summary> </summary>
 339                 /// <param name="replaceInvalidAcronym">Set to true to replace mischaracterized acronyms as HOST.
 340                 /// </param>
 341                 /// <deprecated> Remove in 3.X and make true the only valid value
 342                 ///
 343                 /// See https://issues.apache.org/jira/browse/LUCENE-1068
 344                 /// </deprecated>
 345         [Obsolete("Remove in 3.X and make true the only valid value. See https://issues.apache.org/jira/browse/LUCENE-1068")]
 346                 public virtual void  SetReplaceInvalidAcronym(bool replaceInvalidAcronym)
 347                 {
 348                         this.replaceInvalidAcronym = replaceInvalidAcronym;
 349                 }
 350         }
 351 }