mcs/tools/monkeydoc/Lucene.Net/Lucene.Net/Analysis/Standard/StandardTokenizerImpl.jflex

   1 package org.apache.lucene.analysis.standard;\r
   2 \r
   3 /**\r
   4  * Licensed to the Apache Software Foundation (ASF) under one or more\r
   5  * contributor license agreements.  See the NOTICE file distributed with\r
   6  * this work for additional information regarding copyright ownership.\r
   7  * The ASF licenses this file to You under the Apache License, Version 2.0\r
   8  * (the "License"); you may not use this file except in compliance with\r
   9  * the License.  You may obtain a copy of the License at\r
  10  *\r
  11  *     http://www.apache.org/licenses/LICENSE-2.0\r
  12  *\r
  13  * Unless required by applicable law or agreed to in writing, software\r
  14  * distributed under the License is distributed on an "AS IS" BASIS,\r
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\r
  16  * See the License for the specific language governing permissions and\r
  17  * limitations under the License.\r
  18  */\r
  19 \r
  20 /*\r
  21 \r
  22 NOTE: if you change StandardTokenizerImpl.jflex and need to regenerate\r
  23       the tokenizer, remember to use JRE 1.4 to run jflex (before\r
  24       Lucene 3.0).  This grammar now uses constructs (e.g. :digit:,\r
  25       :letter:) whose meaning can vary according to the JRE used to\r
  26       run jflex.  See\r
  27       https://issues.apache.org/jira/browse/LUCENE-1126 for details.\r
  28 \r
  29 */\r
  30 \r
  31 import org.apache.lucene.analysis.Token;\r
  32 import org.apache.lucene.analysis.tokenattributes.TermAttribute;\r
  33 \r
  34 %%\r
  35 \r
     // Scanner generation options (meanings as given in the JFlex manual):\r
     //   %class    - name of the generated scanner class\r
     //   %unicode  - scan the full Unicode input character set\r
     //   %integer  - the scanning method is declared to return int\r
     //   %function - the scanning method is named getNextToken\r
     //   %pack     - store the generated DFA tables in packed form\r
     //   %char     - keep the yychar character counter that yychar() returns\r
  36 %class StandardTokenizerImpl\r
  37 %unicode\r
  38 %integer\r
  39 %function getNextToken\r
  40 %pack\r
  41 %char\r
  42 \r
  43 %{\r
  44 \r
     // Token type codes returned by the rule actions of this grammar.  Each one\r
     // re-exports the constant of the same name from StandardTokenizer.\r
  45 public static final int ALPHANUM          = StandardTokenizer.ALPHANUM;\r
  46 public static final int APOSTROPHE        = StandardTokenizer.APOSTROPHE;\r
  47 public static final int ACRONYM           = StandardTokenizer.ACRONYM;\r
  48 public static final int COMPANY           = StandardTokenizer.COMPANY;\r
  49 public static final int EMAIL             = StandardTokenizer.EMAIL;\r
  50 public static final int HOST              = StandardTokenizer.HOST;\r
  51 public static final int NUM               = StandardTokenizer.NUM;\r
  52 public static final int CJ                = StandardTokenizer.CJ;\r
  53 /**\r
  54  * @deprecated this solves a bug where HOSTs that end with '.' are identified\r
  55  *             as ACRONYMs. It is deprecated and will be removed in the next\r
  56  *             release.\r
  57  */\r
  58 public static final int ACRONYM_DEP       = StandardTokenizer.ACRONYM_DEP;\r
  59 \r
     // Same array object as StandardTokenizer.TOKEN_TYPES.\r
     // NOTE(review): presumably the type names indexed by the codes above -- confirm\r
     // against StandardTokenizer.\r
  60 public static final String [] TOKEN_TYPES = StandardTokenizer.TOKEN_TYPES;\r
  61 \r
     // Accessor for the scanner field yychar, which the %char option switches on.\r
     // Per the JFlex manual, yychar is the number of characters from the start of\r
     // input to the start of the current match.\r
  62 public final int yychar()\r
  63 {\r
  64     return yychar;\r
  65 }\r
  66 \r
  67 /**\r
  68  * Resets the Tokenizer to a new Reader.\r
      *\r
      * If zzBuffer has grown beyond ZZ_BUFFERSIZE it is first replaced by a new\r
      * array of ZZ_BUFFERSIZE chars; yyreset(r) is then called in every case.\r
      * NOTE(review): the shrink presumably keeps one unusually long input from\r
      * pinning a large buffer for the life of the scanner -- confirm.\r
      *\r
      * @param r the reader passed on to yyreset\r
  69  */\r
  70 final void reset(java.io.Reader r) {\r
  71   // reset to default buffer size, if buffer has grown\r
  72   if (zzBuffer.length > ZZ_BUFFERSIZE) {\r
  73     zzBuffer = new char[ZZ_BUFFERSIZE];\r
  74   }\r
  75   yyreset(r);\r
  76 }\r
  77 \r
  78 /**\r
  79  * Fills Lucene token with the current token text.\r
      *\r
      * The text is the slice of zzBuffer that starts at zzStartRead and is\r
      * (zzMarkedPos - zzStartRead) chars long; it is passed to t.setTermBuffer.\r
      *\r
      * @param t the Token whose term buffer is set\r
  80  */\r
  81 final void getText(Token t) {\r
  82   t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);\r
  83 }\r
  84 \r
  85 /**\r
  86  * Fills TermAttribute with the current token text.\r
      *\r
      * Uses the same zzBuffer slice as getText(Token): offset zzStartRead,\r
      * length (zzMarkedPos - zzStartRead), passed to t.setTermBuffer.\r
      *\r
      * @param t the TermAttribute whose term buffer is set\r
  87  */\r
  88 final void getText(TermAttribute t) {\r
  89   t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);\r
  90 }\r
  91 \r
  92 %}\r
  93 \r
  94 THAI       = [\u0E00-\u0E59]\r
  95 \r
  96 // basic word: a sequence of digits & letters (includes Thai to enable ThaiAnalyzer to function)\r
  97 ALPHANUM   = ({LETTER}|{THAI}|[:digit:])+\r
  98 \r
  99 // internal apostrophes: O'Reilly, you're, O'Reilly's\r
 100 // use a post-filter to remove possessives\r
 101 APOSTROPHE =  {ALPHA} ("'" {ALPHA})+\r
 102 \r
 103 // acronyms: U.S.A., I.B.M., etc.\r
 104 // use a post-filter to remove dots\r
 105 ACRONYM    =  {LETTER} "." ({LETTER} ".")+\r
 106 \r
     // deprecated acronym form: same shape as ACRONYM, but every dotted segment is\r
     // a whole ALPHANUM run instead of a single LETTER\r
 107 ACRONYM_DEP     = {ALPHANUM} "." ({ALPHANUM} ".")+\r
 108 \r
 109 // company names like AT&T and Excite@Home.\r
 110 COMPANY    =  {ALPHA} ("&"|"@") {ALPHA}\r
 111 \r
 112 // email addresses\r
 113 EMAIL      =  {ALPHANUM} (("."|"-"|"_") {ALPHANUM})* "@" {ALPHANUM} (("."|"-") {ALPHANUM})+\r
 114 \r
 115 // hostname\r
 116 HOST       =  {ALPHANUM} ((".") {ALPHANUM})+\r
 117 \r
 118 // floating point, serial, model numbers, ip addresses, etc.\r
 119 // every other segment must have at least one digit\r
 120 NUM        = ({ALPHANUM} {P} {HAS_DIGIT}\r
 121            | {HAS_DIGIT} {P} {ALPHANUM}\r
 122            | {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+\r
 123            | {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+\r
 124            | {ALPHANUM} {P} {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+\r
 125            | {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+)\r
 126 \r
 127 // punctuation\r
 128 P                = ("_"|"-"|"/"|"."|",")\r
 129 \r
 130 // at least one digit\r
 131 HAS_DIGIT  = ({LETTER}|[:digit:])* [:digit:] ({LETTER}|[:digit:])*\r
 132 \r
     // one or more LETTERs (no digits)\r
 133 ALPHA      = ({LETTER})+\r
 134 \r
 135 // From the JFlex manual: "the expression that matches everything of <a> not matched by <b> is !(!<a>|<b>)"\r
     // so LETTER is any [:letter:] character that is not in CJ\r
 136 LETTER     = !(![:letter:]|{CJ})\r
 137 \r
 138 // Chinese and Japanese (but NOT Korean, which is included in [:letter:])\r
 139 CJ         = [\u3100-\u312f\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u3300-\u337f\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff65-\uff9f]\r
 140 \r
     // a CR LF pair, or a single space, CR, LF, tab or form feed\r
 141 WHITESPACE = \r\n | [ \r\n\t\f]\r
 142 \r
 143 %%\r
 144 \r
     // Lexical rules.  Each action returns the int type code of the matched token.\r
     // Per the JFlex manual the longest match wins, and when several rules match\r
     // text of the same length the rule listed first is used, so the order of the\r
     // rules below is significant.  For example, every ACRONYM match is also an\r
     // ACRONYM_DEP match (a single LETTER is a valid ALPHANUM), and ACRONYM is\r
     // listed first.\r
 145 {ALPHANUM}                                                     { return ALPHANUM; }\r
 146 {APOSTROPHE}                                                   { return APOSTROPHE; }\r
 147 {ACRONYM}                                                      { return ACRONYM; }\r
 148 {COMPANY}                                                      { return COMPANY; }\r
 149 {EMAIL}                                                        { return EMAIL; }\r
 150 {HOST}                                                         { return HOST; }\r
 151 {NUM}                                                          { return NUM; }\r
 152 {CJ}                                                           { return CJ; }\r
 153 {ACRONYM_DEP}                                                  { return ACRONYM_DEP; }\r
 154 \r
 155 /** Ignore the rest */\r
     // The action below contains no return statement, so any other single\r
     // character and any WHITESPACE yields no token type.\r
 156 . | {WHITESPACE}                                               { /* ignore */ }\r