2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
20 using CharReader = Mono.Lucene.Net.Analysis.CharReader;
21 using Token = Mono.Lucene.Net.Analysis.Token;
22 using Tokenizer = Mono.Lucene.Net.Analysis.Tokenizer;
23 using OffsetAttribute = Mono.Lucene.Net.Analysis.Tokenattributes.OffsetAttribute;
24 using PositionIncrementAttribute = Mono.Lucene.Net.Analysis.Tokenattributes.PositionIncrementAttribute;
25 using TermAttribute = Mono.Lucene.Net.Analysis.Tokenattributes.TermAttribute;
26 using TypeAttribute = Mono.Lucene.Net.Analysis.Tokenattributes.TypeAttribute;
27 using AttributeSource = Mono.Lucene.Net.Util.AttributeSource;
28 using Version = Mono.Lucene.Net.Util.Version;
30 namespace Mono.Lucene.Net.Analysis.Standard
33 /// <summary>A grammar-based tokenizer constructed with JFlex
35 /// <p/> This should be a good tokenizer for most European-language documents:
38 /// <li>Splits words at punctuation characters, removing punctuation. However, a
39 /// dot that's not followed by whitespace is considered part of a token.</li>
40 /// <li>Splits words at hyphens, unless there's a number in the token, in which case
41 /// the whole token is interpreted as a product number and is not split.</li>
42 /// <li>Recognizes email addresses and internet hostnames as one token.</li>
45 /// <p/>Many applications have specific tokenizer needs. If this tokenizer does
46 /// not suit your application, please consider copying this source code
47 /// directory to your project and maintaining your own grammar-based tokenizer.
49 /// <a name="version"/>
51 /// You must specify the required {@link Version} compatibility when creating a StandardTokenizer:
54 /// <li>As of 2.4, Tokens incorrectly identified as acronyms are corrected (see
55 /// <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>).</li>
59 public class StandardTokenizer:Tokenizer
61 private void InitBlock()
// Gives maxTokenLength its default value, StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH.
// NOTE(review): no call site for InitBlock is visible in this view - confirm
// that the constructors invoke it before the limit is read.
63 maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
65 /// <summary>A private instance of the JFlex-constructed scanner </summary>
66 private StandardTokenizerImpl scanner;
// Integer codes for the token types. Each value is the index of the matching
// label in TOKEN_TYPES (for example HOST = 5 selects "<HOST>").
68 public const int ALPHANUM = 0;
69 public const int APOSTROPHE = 1;
70 public const int ACRONYM = 2;
71 public const int COMPANY = 3;
72 public const int EMAIL = 4;
73 public const int HOST = 5;
74 public const int NUM = 6;
75 public const int CJ = 7;
77 /// <deprecated> This solves a bug where HOSTs that end with '.' are identified
78 /// as ACRONYMs. It is deprecated and will be removed in the next release. </deprecated>
81 [Obsolete("this solves a bug where HOSTs that end with '.' are identified as ACRONYMs. It is deprecated and will be removed in the next release.")]
82 public const int ACRONYM_DEP = 8;
84 /// <summary>String token types that correspond to token type int constants </summary>
85 public static readonly System.String[] TOKEN_TYPES = new System.String[]{"<ALPHANUM>", "<APOSTROPHE>", "<ACRONYM>", "<COMPANY>", "<EMAIL>", "<HOST>", "<NUM>", "<CJ>", "<ACRONYM_DEP>"};
87 /// <deprecated> Please use {@link #TOKEN_TYPES} instead. </deprecated>
89 [Obsolete("Please use TOKEN_TYPES instead")]
90 public static readonly System.String[] tokenImage = TOKEN_TYPES;
92 /// <summary> Specifies whether deprecated acronyms should be replaced with HOST type.
93 /// This is false by default to support backward compatibility.
95 /// See http://issues.apache.org/jira/browse/LUCENE-1068
98 /// <deprecated> this should be removed in the next release (3.0).
100 [Obsolete("this should be removed in the next release (3.0).")]
101 private bool replaceInvalidAcronym;
103 private int maxTokenLength;
105 /// <summary>Set the max allowed token length. Any token longer
106 /// than this is skipped.
108 public virtual void SetMaxTokenLength(int length)
// The value is stored as-is, with no range check. IncrementToken compares
// scanner.Yylength() against it to decide whether a token is published.
110 this.maxTokenLength = length;
113 /// <seealso cref="SetMaxTokenLength"/>
115 public virtual int GetMaxTokenLength()
// Returns the limit last assigned by SetMaxTokenLength or InitBlock.
117 return maxTokenLength;
120 /// <summary> Creates a new instance of the {@link StandardTokenizer}. Attaches the
121 /// <code>input</code> to a newly created JFlex scanner.
123 /// <deprecated> Use {@link #StandardTokenizer(Version, Reader)} instead
125 [Obsolete("Use StandardTokenizer(Version, Reader) instead")]
126 public StandardTokenizer(System.IO.TextReader input):this(Version.LUCENE_24, input)
// Pure delegation: forwards to the (Version, TextReader) constructor with
// Version.LUCENE_24 fixed as the compatibility level.
130 /// <summary> Creates a new instance of the {@link Mono.Lucene.Net.Analysis.Standard.StandardTokenizer}. Attaches
131 /// the <code>input</code> to the newly created JFlex scanner.
134 /// <param name="input">The input reader
136 /// <param name="replaceInvalidAcronym">Set to true to replace mischaracterized acronyms with HOST.
138 /// See http://issues.apache.org/jira/browse/LUCENE-1068
140 /// <deprecated> Use {@link #StandardTokenizer(Version, Reader)} instead
142 [Obsolete("Use StandardTokenizer(Version, Reader) instead")]
143 public StandardTokenizer(System.IO.TextReader input, bool replaceInvalidAcronym):base()
// Create the JFlex scanner over the reader, then let Init(TextReader, bool)
// record the acronym flag and register the term, offset, position-increment
// and type attributes.
146 this.scanner = new StandardTokenizerImpl(input);
147 Init(input, replaceInvalidAcronym);
150 /// <summary> Creates a new instance of the
151 /// {@link Mono.Lucene.Net.Analysis.Standard.StandardTokenizer}. Attaches
152 /// the <code>input</code> to the newly created JFlex scanner.
155 /// <param name="input">The input reader
157 /// See http://issues.apache.org/jira/browse/LUCENE-1068
159 public StandardTokenizer(Version matchVersion, System.IO.TextReader input):base()
// Create the JFlex scanner over the reader.
162 this.scanner = new StandardTokenizerImpl(input);
// Resolves to the Init(TextReader, Version) overload, which branches on
// matchVersion.OnOrAfter(Version.LUCENE_24).
163 Init(input, matchVersion);
166 /// <summary> Creates a new StandardTokenizer with a given {@link AttributeSource}. </summary>
168 /// <deprecated> Use {@link #StandardTokenizer(Version, AttributeSource, Reader)} instead. </deprecated>
171 [Obsolete("Use StandardTokenizer(Version, AttributeSource, Reader) instead")]
172 public StandardTokenizer(AttributeSource source, System.IO.TextReader input, bool replaceInvalidAcronym):base(source)
// The supplied AttributeSource goes to the base constructor. The rest matches
// the other constructors: create the scanner, then initialise through
// Init(TextReader, bool) with the caller's acronym flag.
175 this.scanner = new StandardTokenizerImpl(input);
176 Init(input, replaceInvalidAcronym);
179 /// <summary> Creates a new StandardTokenizer with a given {@link AttributeSource}.</summary>
180 public StandardTokenizer(Version matchVersion, AttributeSource source, System.IO.TextReader input):base(source)
// The supplied AttributeSource goes to the base constructor.
183 this.scanner = new StandardTokenizerImpl(input);
// Resolves to the Init(TextReader, Version) overload, so the acronym handling
// is derived from matchVersion rather than passed in by the caller.
184 Init(input, matchVersion);
187 /// <summary> Creates a new StandardTokenizer with a given {@link Mono.Lucene.Net.Util.AttributeSource.AttributeFactory} </summary>
189 /// <deprecated> Use {@link #StandardTokenizer(Version, Mono.Lucene.Net.Util.AttributeSource.AttributeFactory, Reader)} instead. </deprecated>
192 [Obsolete("Use StandardTokenizer(Version, Mono.Lucene.Net.Util.AttributeSource.AttributeFactory, Reader) instead")]
193 public StandardTokenizer(AttributeFactory factory, System.IO.TextReader input, bool replaceInvalidAcronym):base(factory)
// The supplied AttributeFactory goes to the base constructor. The rest matches
// the other constructors: create the scanner, then initialise through
// Init(TextReader, bool) with the caller's acronym flag.
196 this.scanner = new StandardTokenizerImpl(input);
197 Init(input, replaceInvalidAcronym);
200 /// <summary> Creates a new StandardTokenizer with a given
201 /// {@link Mono.Lucene.Net.Util.AttributeSource.AttributeFactory}. </summary>
203 public StandardTokenizer(Version matchVersion, AttributeFactory factory, System.IO.TextReader input):base(factory)
// The supplied AttributeFactory goes to the base constructor.
206 this.scanner = new StandardTokenizerImpl(input);
// Resolves to the Init(TextReader, Version) overload, so the acronym handling
// is derived from matchVersion rather than passed in by the caller.
207 Init(input, matchVersion);
210 private void Init(System.IO.TextReader input, bool replaceInvalidAcronym)
// Record the flag that IncrementToken consults when it sees an ACRONYM_DEP token.
212 this.replaceInvalidAcronym = replaceInvalidAcronym;
// NOTE(review): no statement visible in this view uses the input parameter -
// confirm against the full file.
// Register the four attributes this tokenizer fills in and cache the typed
// references for use in IncrementToken and End.
214 termAtt = (TermAttribute) AddAttribute(typeof(TermAttribute));
215 offsetAtt = (OffsetAttribute) AddAttribute(typeof(OffsetAttribute));
216 posIncrAtt = (PositionIncrementAttribute) AddAttribute(typeof(PositionIncrementAttribute));
217 typeAtt = (TypeAttribute) AddAttribute(typeof(TypeAttribute));
220 private void Init(System.IO.TextReader input, Version matchVersion)
// Version-based entry point: branches on whether matchVersion is LUCENE_24 or later.
// NOTE(review): neither branch body is visible in this view; presumably each
// calls Init(input, bool) with a different flag - confirm against the full file.
222 if (matchVersion.OnOrAfter(Version.LUCENE_24))
232 // this tokenizer generates four attributes:
233 // term, offset, positionIncrement and type
234 private TermAttribute termAtt;
235 private OffsetAttribute offsetAtt;
236 private PositionIncrementAttribute posIncrAtt;
237 private TypeAttribute typeAtt;
242 * @see Mono.Lucene.Net.Analysis.TokenStream#IncrementToken()
244 public override bool IncrementToken()
// Pulls the next token from the JFlex scanner and publishes it through the
// term, offset, position-increment and type attributes.
// NOTE(review): the enclosing loop, the declaration of posIncr and the return
// statements are not visible in this view - confirm against the full file.
251 int tokenType = scanner.GetNextToken();
// Presumably YYEOF is the scanner's end-of-input code (JFlex convention);
// the branch body is not visible here.
253 if (tokenType == StandardTokenizerImpl.YYEOF)
// Only tokens whose length does not exceed maxTokenLength are published.
258 if (scanner.Yylength() <= maxTokenLength)
260 posIncrAtt.SetPositionIncrement(posIncr);
// Presumably fills termAtt with the matched text; its TermLength() is read just below.
261 scanner.GetText(termAtt);
// The offsets run from scanner.Yychar() to that value plus the term length;
// both ends are passed through CorrectOffset.
262 int start = scanner.Yychar();
263 offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + termAtt.TermLength()));
264 // This 'if' should be removed in the next release. For now, it converts
265 // invalid acronyms to HOST. When removed, only the 'else' part should remain.
267 if (tokenType == StandardTokenizerImpl.ACRONYM_DEP)
// With the flag set, the token is relabelled as HOST and its last character is dropped.
269 if (replaceInvalidAcronym)
271 typeAtt.SetType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
272 termAtt.SetTermLength(termAtt.TermLength() - 1); // remove extra '.'
// Presumably the flag-off branch: the token is labelled ACRONYM and its text is left unchanged.
276 typeAtt.SetType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
// Presumably the branch for every other token type: the label is looked up
// directly from the scanner's type code.
281 typeAtt.SetType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
285 // When we skip a too-long term, we still increment the
286 // position increment
292 public override void End()
// The final offset is scanner.Yychar() + scanner.Yylength(), passed through CorrectOffset.
295 int finalOffset = CorrectOffset(scanner.Yychar() + scanner.Yylength());
// Start and end are set to the same value.
296 offsetAtt.SetOffset(finalOffset, finalOffset);
299 /// <deprecated> Will be removed in Lucene 3.0. This method is final, as it should
300 /// not be overridden. Delegates to the backwards compatibility layer.
302 [Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer. ")]
303 public override Token Next(Token reusableToken)
// Straight pass-through to the base implementation; nothing tokenizer-specific happens here.
305 return base.Next(reusableToken);
308 /// <deprecated> Will be removed in Lucene 3.0. This method is final, as it should
309 /// not be overridden. Delegates to the backwards compatibility layer.
311 [Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer. ")]
312 public override Token Next()
// NOTE(review): the body is not visible in this view. The sibling overload
// Next(Token) simply returns base.Next(reusableToken); presumably this one
// delegates the same way - confirm against the full file.
318 public override void Reset(System.IO.TextReader reader)
// Hands the new reader to the JFlex scanner through scanner.Reset.
// NOTE(review): no base.Reset(reader) call is visible in this view - confirm
// that the full file also resets the base class's input.
321 scanner.Reset(reader);
324 /// <summary> Prior to https://issues.apache.org/jira/browse/LUCENE-1068, StandardTokenizer mischaracterized as acronyms tokens like www.abc.com
325 /// when they should have been labeled as hosts instead.
327 /// <returns> true if StandardTokenizer now returns these tokens as Hosts, otherwise false
330 /// <deprecated> Remove in 3.X and make true the only valid value
332 [Obsolete("Remove in 3.X and make true the only valid value")]
333 public virtual bool IsReplaceInvalidAcronym()
// Returns the flag set by Init(TextReader, bool) or SetReplaceInvalidAcronym.
// IncrementToken reads it for ACRONYM_DEP tokens.
335 return replaceInvalidAcronym;
338 /// <summary> Sets whether mischaracterized acronyms are replaced with HOST. </summary>
339 /// <param name="replaceInvalidAcronym">Set to true to replace mischaracterized acronyms as HOST.
341 /// <deprecated> Remove in 3.X and make true the only valid value
343 /// See https://issues.apache.org/jira/browse/LUCENE-1068
345 [Obsolete("Remove in 3.X and make true the only valid value. See https://issues.apache.org/jira/browse/LUCENE-1068")]
346 public virtual void SetReplaceInvalidAcronym(bool replaceInvalidAcronym)
// Overwrites the flag chosen at construction time. IncrementToken reads it for
// each ACRONYM_DEP token, so the change applies to tokens produced afterwards.
348 this.replaceInvalidAcronym = replaceInvalidAcronym;