1 package org.apache.lucene.analysis.standard;
\r
4 * Licensed to the Apache Software Foundation (ASF) under one or more
\r
5 * contributor license agreements. See the NOTICE file distributed with
\r
6 * this work for additional information regarding copyright ownership.
\r
7 * The ASF licenses this file to You under the Apache License, Version 2.0
\r
8 * (the "License"); you may not use this file except in compliance with
\r
9 * the License. You may obtain a copy of the License at
\r
11 * http://www.apache.org/licenses/LICENSE-2.0
\r
13 * Unless required by applicable law or agreed to in writing, software
\r
14 * distributed under the License is distributed on an "AS IS" BASIS,
\r
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
\r
16 * See the License for the specific language governing permissions and
\r
17 * limitations under the License.
\r
22 NOTE: if you change StandardTokenizerImpl.jflex and need to regenerate
\r
23 the tokenizer, remember to use JRE 1.4 to run jflex (before
\r
24 Lucene 3.0). This grammar now uses constructs (e.g. :digit:,
\r
25 :letter:) whose meaning can vary according to the JRE used to
\r
27 https://issues.apache.org/jira/browse/LUCENE-1126 for details.
\r
31 import org.apache.lucene.analysis.Token;
\r
32 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
\r
// %class: name that JFlex gives to the generated scanner class
36 %class StandardTokenizerImpl
\r
// %function: name that JFlex gives to the generated scanning method
39 %function getNextToken
\r
// Token type codes, each copied from the same-named constant on StandardTokenizer.
// The lexical rules of this grammar return these values from their actions.
45 public static final int ALPHANUM = StandardTokenizer.ALPHANUM;
\r
46 public static final int APOSTROPHE = StandardTokenizer.APOSTROPHE;
\r
47 public static final int ACRONYM = StandardTokenizer.ACRONYM;
\r
48 public static final int COMPANY = StandardTokenizer.COMPANY;
\r
49 public static final int EMAIL = StandardTokenizer.EMAIL;
\r
50 public static final int HOST = StandardTokenizer.HOST;
\r
51 public static final int NUM = StandardTokenizer.NUM;
\r
52 public static final int CJ = StandardTokenizer.CJ;
\r
54 * @deprecated this solves a bug where HOSTs that end with '.' are identified
\r
55 * as ACRONYMs. It is deprecated and will be removed in the next
\r
58 public static final int ACRONYM_DEP = StandardTokenizer.ACRONYM_DEP;
\r
// Refers to the same array object as StandardTokenizer.TOKEN_TYPES (a reference, not a copy).
60 public static final String [] TOKEN_TYPES = StandardTokenizer.TOKEN_TYPES;
\r
62 public final int yychar()
\r
68 * Resets the Tokenizer to a new Reader.
\r
70 final void reset(java.io.Reader r) {
\r
71 // reset to default buffer size, if buffer has grown
\r
72 if (zzBuffer.length > ZZ_BUFFERSIZE) {
\r
73 zzBuffer = new char[ZZ_BUFFERSIZE];
\r
79 * Fills Lucene token with the current token text.
\r
81 final void getText(Token t) {
\r
82 t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
\r
86 * Fills TermAttribute with the current token text.
\r
88 final void getText(TermAttribute t) {
\r
89 t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
\r
// Thai characters: code points U+0E00 through U+0E59; referenced by ALPHANUM
94 THAI = [\u0E00-\u0E59]
\r
95 // basic word: a sequence of digits & letters (includes Thai to enable ThaiAnalyzer to function)
// one or more characters, each a {LETTER}, a {THAI} character or a [:digit:], in any mix;
// {LETTER} is itself defined to exclude the {CJ} range
\r
97 ALPHANUM = ({LETTER}|{THAI}|[:digit:])+
\r
99 // internal apostrophes: O'Reilly, you're, O'Reilly's
\r
100 // use a post-filter to remove possessives
// pattern: a run of letters followed by one or more groups of "'" plus another run of letters,
// so an apostrophe is only matched when it has letters on both sides
\r
101 APOSTROPHE = {ALPHA} ("'" {ALPHA})+
\r
103 // acronyms: U.S.A., I.B.M., etc.
\r
104 // use a post-filter to remove dots
// pattern: single letters each followed by ".", with at least two such letter-dot pairs;
// the match always ends on a "."
\r
105 ACRONYM = {LETTER} "." ({LETTER} ".")+
\r
// same shape as ACRONYM, but each dot-terminated segment is a whole {ALPHANUM}
// instead of a single {LETTER}; at least two segments, and the match ends on a "."
107 ACRONYM_DEP = {ALPHANUM} "." ({ALPHANUM} ".")+
\r
109 // company names like AT&T and Excite@Home.
// pattern: exactly one "&" or "@" between two runs of letters (no digits on either side)
\r
110 COMPANY = {ALPHA} ("&"|"@") {ALPHA}
\r
// email address: local part of {ALPHANUM} segments joined by ".", "-" or "_", then "@",
// then a domain of at least two {ALPHANUM} segments joined by "." or "-"
113 EMAIL = {ALPHANUM} (("."|"-"|"_") {ALPHANUM})* "@" {ALPHANUM} (("."|"-") {ALPHANUM})+
\r
// host name: two or more {ALPHANUM} segments joined by "."; the match never ends on a "."
116 HOST = {ALPHANUM} ((".") {ALPHANUM})+
\r
118 // floating point, serial, model numbers, ip addresses, etc.
\r
119 // every other segment must have at least one digit
// Segments are joined by {P} (defined further down). The six alternatives cover chains of
// two or more segments in which {ALPHANUM} and {HAS_DIGIT} segments strictly alternate,
// starting with either kind.
\r
120 NUM = ({ALPHANUM} {P} {HAS_DIGIT}
\r
121 | {HAS_DIGIT} {P} {ALPHANUM}
\r
122 | {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+
\r
123 | {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
\r
124 | {ALPHANUM} {P} {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
\r
125 | {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+)
\r
// single separator character allowed between the segments of a NUM: "_", "-", "/", "." or ","
128 P = ("_"|"-"|"/"|"."|",")
\r
130 // at least one digit
// a run of letters and digits that contains a [:digit:] somewhere; unlike ALPHANUM,
// {THAI} is not listed among the allowed characters here
\r
131 HAS_DIGIT = ({LETTER}|[:digit:])* [:digit:] ({LETTER}|[:digit:])*
\r
// one or more {LETTER} characters (letters only, no digits)
133 ALPHA = ({LETTER})+
\r
135 // From the JFlex manual: "the expression that matches everything of <a> not matched by <b> is !(!<a>|<b>)"
// here <a> is [:letter:] and <b> is {CJ}: any [:letter:] character that is not in {CJ}
\r
136 LETTER = !(![:letter:]|{CJ})
\r
138 // Chinese and Japanese (but NOT Korean, which is included in [:letter:])
// a single character from one of these ranges (Unicode block names):
//   U+3100-U+312F Bopomofo                  U+3040-U+309F Hiragana
//   U+30A0-U+30FF Katakana                  U+31F0-U+31FF Katakana Phonetic Extensions
//   U+3300-U+337F CJK Compatibility         U+3400-U+4DBF CJK Unified Ideographs Extension A
//   U+4E00-U+9FFF CJK Unified Ideographs    U+F900-U+FAFF CJK Compatibility Ideographs
//   U+FF65-U+FF9F halfwidth Katakana part of Halfwidth and Fullwidth Forms
\r
139 CJ = [\u3100-\u312f\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u3300-\u337f\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff65-\uff9f]
\r
// a CR LF pair, or a single space, CR, LF, tab or form feed
141 WHITESPACE = \r\n | [ \r\n\t\f]
\r
// One rule per token macro; each action returns the type constant of the same name.
// NOTE(review): per the JFlex manual the longest match wins and, for matches of equal
// length, the rule listed first wins, so the order below matters; confirm before reordering.
145 {ALPHANUM} { return ALPHANUM; }
\r
146 {APOSTROPHE} { return APOSTROPHE; }
\r
147 {ACRONYM} { return ACRONYM; }
\r
148 {COMPANY} { return COMPANY; }
\r
149 {EMAIL} { return EMAIL; }
\r
150 {HOST} { return HOST; }
\r
151 {NUM} { return NUM; }
\r
152 {CJ} { return CJ; }
\r
153 {ACRONYM_DEP} { return ACRONYM_DEP; }
\r
155 /** Ignore the rest */
// Fallback: any other character matched by ".", or a {WHITESPACE} match, is consumed by
// an action that returns nothing.
\r
156 . | {WHITESPACE} { /* ignore */ }
\r