Merge pull request #409 from Alkarex/patch-1
[mono.git] / mcs / tools / monkeydoc / Lucene.Net / Lucene.Net / Analysis / Standard / StandardTokenizer.cs
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17
18 using System;
19
20 using CharReader = Mono.Lucene.Net.Analysis.CharReader;
21 using Token = Mono.Lucene.Net.Analysis.Token;
22 using Tokenizer = Mono.Lucene.Net.Analysis.Tokenizer;
23 using OffsetAttribute = Mono.Lucene.Net.Analysis.Tokenattributes.OffsetAttribute;
24 using PositionIncrementAttribute = Mono.Lucene.Net.Analysis.Tokenattributes.PositionIncrementAttribute;
25 using TermAttribute = Mono.Lucene.Net.Analysis.Tokenattributes.TermAttribute;
26 using TypeAttribute = Mono.Lucene.Net.Analysis.Tokenattributes.TypeAttribute;
27 using AttributeSource = Mono.Lucene.Net.Util.AttributeSource;
28 using Version = Mono.Lucene.Net.Util.Version;
29
namespace Mono.Lucene.Net.Analysis.Standard
{
    /// <summary>A grammar-based tokenizer constructed with JFlex.
    /// <para>This should be a good tokenizer for most European-language documents:</para>
    /// <list type="bullet">
    /// <item><description>Splits words at punctuation characters, removing punctuation. However, a
    /// dot that's not followed by whitespace is considered part of a token.</description></item>
    /// <item><description>Splits words at hyphens, unless there's a number in the token, in which case
    /// the whole token is interpreted as a product number and is not split.</description></item>
    /// <item><description>Recognizes email addresses and internet hostnames as one token.</description></item>
    /// </list>
    /// <para>Many applications have specific tokenizer needs.  If this tokenizer does
    /// not suit your application, please consider copying this source code
    /// directory to your project and maintaining your own grammar-based tokenizer.</para>
    /// <para>You must specify the required <see cref="Version"/> compatibility when creating
    /// StandardTokenizer:</para>
    /// <list type="bullet">
    /// <item><description>As of 2.4, Tokens incorrectly identified as acronyms are corrected (see
    /// <a href="https://issues.apache.org/jira/browse/LUCENE-1068">LUCENE-1068</a>).</description></item>
    /// </list>
    /// </summary>
    public class StandardTokenizer : Tokenizer
    {
        /// <summary>A private instance of the JFlex-constructed scanner.</summary>
        private StandardTokenizerImpl scanner;

        // Token type ids. Each value is the index of the matching label in TOKEN_TYPES.
        public const int ALPHANUM = 0;
        public const int APOSTROPHE = 1;
        public const int ACRONYM = 2;
        public const int COMPANY = 3;
        public const int EMAIL = 4;
        public const int HOST = 5;
        public const int NUM = 6;
        public const int CJ = 7;

        /// <summary>Token type id for the deprecated acronym rule.</summary>
        /// <remarks>Deprecated: this solves a bug where HOSTs that end with '.' are identified
        /// as ACRONYMs. It is deprecated and will be removed in the next release.
        /// </remarks>
        [Obsolete("this solves a bug where HOSTs that end with '.' are identified as ACRONYMs. It is deprecated and will be removed in the next release.")]
        public const int ACRONYM_DEP = 8;

        /// <summary>String token types that correspond to token type int constants.</summary>
        public static readonly string[] TOKEN_TYPES = new string[]
        {
            "<ALPHANUM>", "<APOSTROPHE>", "<ACRONYM>", "<COMPANY>", "<EMAIL>", "<HOST>", "<NUM>", "<CJ>", "<ACRONYM_DEP>"
        };

        /// <summary>Alias of <see cref="TOKEN_TYPES"/> (the same array instance).</summary>
        /// <remarks>Deprecated: please use <see cref="TOKEN_TYPES"/> instead.</remarks>
        [Obsolete("Please use TOKEN_TYPES instead")]
        public static readonly string[] tokenImage = TOKEN_TYPES;

        /// <summary>Specifies whether deprecated acronyms should be replaced with HOST type.
        /// <para>See http://issues.apache.org/jira/browse/LUCENE-1068</para>
        /// </summary>
        // Scheduled for removal in the next release (3.0). The field is private, so it is
        // deliberately not marked [Obsolete]: that would only raise obsolete-usage warnings
        // at this class's own reads and writes.
        private bool replaceInvalidAcronym;

        /// <summary>Maximum token length; starts at <c>StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH</c>.</summary>
        private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;

        // This tokenizer populates four attributes:
        // term, offset, positionIncrement and type.
        private TermAttribute termAtt;
        private OffsetAttribute offsetAtt;
        private PositionIncrementAttribute posIncrAtt;
        private TypeAttribute typeAtt;

        /// <summary>Set the max allowed token length.  Any token longer
        /// than this is skipped.
        /// </summary>
        /// <param name="length">The new maximum token length.</param>
        public virtual void SetMaxTokenLength(int length)
        {
            this.maxTokenLength = length;
        }

        /// <summary>Returns the current max allowed token length.</summary>
        /// <seealso cref="SetMaxTokenLength"/>
        public virtual int GetMaxTokenLength()
        {
            return maxTokenLength;
        }

        /// <summary>Creates a new instance of the <see cref="StandardTokenizer"/>. Attaches the
        /// <c>input</c> to a newly created JFlex scanner. Delegates to the
        /// version-aware constructor with <c>Version.LUCENE_24</c>.
        /// </summary>
        /// <param name="input">The input reader.</param>
        /// <remarks>Deprecated: use <see cref="StandardTokenizer(Version, System.IO.TextReader)"/> instead.</remarks>
        [Obsolete("Use StandardTokenizer(Version, Reader) instead")]
        public StandardTokenizer(System.IO.TextReader input) : this(Version.LUCENE_24, input)
        {
        }

        /// <summary>Creates a new instance of the <see cref="StandardTokenizer"/>.  Attaches
        /// the <c>input</c> to the newly created JFlex scanner.
        /// </summary>
        /// <param name="input">The input reader.</param>
        /// <param name="replaceInvalidAcronym">Set to true to replace mischaracterized acronyms with HOST.
        /// See http://issues.apache.org/jira/browse/LUCENE-1068
        /// </param>
        /// <remarks>Deprecated: use <see cref="StandardTokenizer(Version, System.IO.TextReader)"/> instead.</remarks>
        [Obsolete("Use StandardTokenizer(Version, Reader) instead")]
        public StandardTokenizer(System.IO.TextReader input, bool replaceInvalidAcronym) : base()
        {
            this.scanner = new StandardTokenizerImpl(input);
            Init(input, replaceInvalidAcronym);
        }

        /// <summary>Creates a new instance of the <see cref="StandardTokenizer"/>. Attaches
        /// the <c>input</c> to the newly created JFlex scanner.
        /// </summary>
        /// <param name="matchVersion">Compatibility version; invalid acronyms are replaced with HOST
        /// when it is on or after <c>Version.LUCENE_24</c>.
        /// See http://issues.apache.org/jira/browse/LUCENE-1068
        /// </param>
        /// <param name="input">The input reader.</param>
        public StandardTokenizer(Version matchVersion, System.IO.TextReader input) : base()
        {
            this.scanner = new StandardTokenizerImpl(input);
            Init(input, matchVersion);
        }

        /// <summary>Creates a new StandardTokenizer with a given <see cref="AttributeSource"/>.</summary>
        /// <param name="source">The attribute source handed to the base tokenizer.</param>
        /// <param name="input">The input reader.</param>
        /// <param name="replaceInvalidAcronym">Set to true to replace mischaracterized acronyms with HOST.</param>
        /// <remarks>Deprecated: use
        /// <see cref="StandardTokenizer(Version, AttributeSource, System.IO.TextReader)"/> instead.
        /// </remarks>
        [Obsolete("Use StandardTokenizer(Version, AttributeSource, Reader) instead")]
        public StandardTokenizer(AttributeSource source, System.IO.TextReader input, bool replaceInvalidAcronym) : base(source)
        {
            this.scanner = new StandardTokenizerImpl(input);
            Init(input, replaceInvalidAcronym);
        }

        /// <summary>Creates a new StandardTokenizer with a given <see cref="AttributeSource"/>.</summary>
        /// <param name="matchVersion">Compatibility version; invalid acronyms are replaced with HOST
        /// when it is on or after <c>Version.LUCENE_24</c>.</param>
        /// <param name="source">The attribute source handed to the base tokenizer.</param>
        /// <param name="input">The input reader.</param>
        public StandardTokenizer(Version matchVersion, AttributeSource source, System.IO.TextReader input) : base(source)
        {
            this.scanner = new StandardTokenizerImpl(input);
            Init(input, matchVersion);
        }

        /// <summary>Creates a new StandardTokenizer with a given attribute factory.</summary>
        /// <param name="factory">The attribute factory handed to the base tokenizer.</param>
        /// <param name="input">The input reader.</param>
        /// <param name="replaceInvalidAcronym">Set to true to replace mischaracterized acronyms with HOST.</param>
        /// <remarks>Deprecated: use the constructor taking
        /// (Version, AttributeFactory, TextReader) instead.
        /// </remarks>
        [Obsolete("Use StandardTokenizer(Version, Mono.Lucene.Net.Util.AttributeSource.AttributeFactory, Reader) instead")]
        public StandardTokenizer(AttributeFactory factory, System.IO.TextReader input, bool replaceInvalidAcronym) : base(factory)
        {
            this.scanner = new StandardTokenizerImpl(input);
            Init(input, replaceInvalidAcronym);
        }

        /// <summary>Creates a new StandardTokenizer with a given attribute factory.</summary>
        /// <param name="matchVersion">Compatibility version; invalid acronyms are replaced with HOST
        /// when it is on or after <c>Version.LUCENE_24</c>.</param>
        /// <param name="factory">The attribute factory handed to the base tokenizer.</param>
        /// <param name="input">The input reader.</param>
        public StandardTokenizer(Version matchVersion, AttributeFactory factory, System.IO.TextReader input) : base(factory)
        {
            this.scanner = new StandardTokenizerImpl(input);
            Init(input, matchVersion);
        }

        /// <summary>Shared constructor tail: stores the acronym flag and the reader, and
        /// registers the term, offset, position-increment and type attributes.
        /// </summary>
        private void Init(System.IO.TextReader input, bool replaceInvalidAcronym)
        {
            this.replaceInvalidAcronym = replaceInvalidAcronym;
            this.input = input;
            termAtt = (TermAttribute) AddAttribute(typeof(TermAttribute));
            offsetAtt = (OffsetAttribute) AddAttribute(typeof(OffsetAttribute));
            posIncrAtt = (PositionIncrementAttribute) AddAttribute(typeof(PositionIncrementAttribute));
            typeAtt = (TypeAttribute) AddAttribute(typeof(TypeAttribute));
        }

        /// <summary>Version-aware initialisation: invalid-acronym replacement is enabled
        /// exactly when <paramref name="matchVersion"/> is on or after <c>Version.LUCENE_24</c>.
        /// </summary>
        private void Init(System.IO.TextReader input, Version matchVersion)
        {
            Init(input, matchVersion.OnOrAfter(Version.LUCENE_24));
        }

        /// <summary>Advances to the next token that is not longer than the max token length and
        /// fills the term, offset, position-increment and type attributes for it.
        /// </summary>
        /// <returns>false when the scanner reports end of input (<c>YYEOF</c>); true otherwise.</returns>
        public override bool IncrementToken()
        {
            ClearAttributes();
            int posIncr = 1;

            while (true)
            {
                int tokenType = scanner.GetNextToken();

                if (tokenType == StandardTokenizerImpl.YYEOF)
                {
                    return false;
                }

                if (scanner.Yylength() > maxTokenLength)
                {
                    // When we skip a too-long term, we still increment the
                    // position increment.
                    posIncr++;
                    continue;
                }

                posIncrAtt.SetPositionIncrement(posIncr);
                scanner.GetText(termAtt);
                int start = scanner.Yychar();
                offsetAtt.SetOffset(CorrectOffset(start), CorrectOffset(start + termAtt.TermLength()));

                // The ACRONYM_DEP branch should be removed in the next release. For now, it
                // converts invalid acronyms to HOST. When removed, only the plain
                // TOKEN_TYPES[tokenType] assignment should remain.
                if (tokenType != StandardTokenizerImpl.ACRONYM_DEP)
                {
                    typeAtt.SetType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
                }
                else if (replaceInvalidAcronym)
                {
                    typeAtt.SetType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
                    // Remove the extra '.'. The offsets were already set above from the
                    // untrimmed length, so the end offset still covers that trailing dot.
                    termAtt.SetTermLength(termAtt.TermLength() - 1);
                }
                else
                {
                    typeAtt.SetType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
                }

                return true;
            }
        }

        /// <summary>Sets the final offset: both start and end become the corrected position
        /// just past the scanner's current match (<c>Yychar() + Yylength()</c>).
        /// </summary>
        public override void End()
        {
            int finalOffset = CorrectOffset(scanner.Yychar() + scanner.Yylength());
            offsetAtt.SetOffset(finalOffset, finalOffset);
        }

        /// <summary>Delegates to the base implementation (backwards compatibility layer).</summary>
        /// <remarks>Deprecated: will be removed in Lucene 3.0. This method should
        /// not be overridden.
        /// </remarks>
        [Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer. ")]
        public override Token Next(Token reusableToken)
        {
            return base.Next(reusableToken);
        }

        /// <summary>Delegates to the base implementation (backwards compatibility layer).</summary>
        /// <remarks>Deprecated: will be removed in Lucene 3.0. This method should
        /// not be overridden.
        /// </remarks>
        [Obsolete("Will be removed in Lucene 3.0. This method is final, as it should not be overridden. Delegates to the backwards compatibility layer. ")]
        public override Token Next()
        {
            return base.Next();
        }

        /// <summary>Resets the base tokenizer and then the JFlex scanner onto <paramref name="reader"/>.</summary>
        /// <param name="reader">The new input reader.</param>
        public override void Reset(System.IO.TextReader reader)
        {
            base.Reset(reader);
            scanner.Reset(reader);
        }

        /// <summary>Prior to https://issues.apache.org/jira/browse/LUCENE-1068, StandardTokenizer mischaracterized as acronyms tokens like www.abc.com
        /// when they should have been labeled as hosts instead.
        /// </summary>
        /// <returns>true if StandardTokenizer now returns these tokens as Hosts, otherwise false.</returns>
        /// <remarks>Deprecated: remove in 3.X and make true the only valid value.</remarks>
        [Obsolete("Remove in 3.X and make true the only valid value")]
        public virtual bool IsReplaceInvalidAcronym()
        {
            return replaceInvalidAcronym;
        }

        /// <summary>Sets whether mischaracterized acronyms are replaced with HOST.</summary>
        /// <param name="replaceInvalidAcronym">Set to true to replace mischaracterized acronyms as HOST.</param>
        /// <remarks>Deprecated: remove in 3.X and make true the only valid value.
        /// See https://issues.apache.org/jira/browse/LUCENE-1068
        /// </remarks>
        [Obsolete("Remove in 3.X and make true the only valid value. See https://issues.apache.org/jira/browse/LUCENE-1068")]
        public virtual void SetReplaceInvalidAcronym(bool replaceInvalidAcronym)
        {
            this.replaceInvalidAcronym = replaceInvalidAcronym;
        }
    }
}