mcs/tools/monkeydoc/Lucene.Net/Lucene.Net/Search/SloppyPhraseScorer.cs

   1 /*
   2  * Licensed to the Apache Software Foundation (ASF) under one or more
   3  * contributor license agreements.  See the NOTICE file distributed with
   4  * this work for additional information regarding copyright ownership.
   5  * The ASF licenses this file to You under the Apache License, Version 2.0
   6  * (the "License"); you may not use this file except in compliance with
   7  * the License.  You may obtain a copy of the License at
   8  *
   9  * http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  */
  17
  18 using System;
  19
  20 using TermPositions = Mono.Lucene.Net.Index.TermPositions;
  21
  22 namespace Mono.Lucene.Net.Search
  23 {
  24
  25         sealed class SloppyPhraseScorer:PhraseScorer
  26         {
  27                 private int slop;
  28                 private PhrasePositions[] repeats;
  29                 private PhrasePositions[] tmpPos; // for flipping repeating pps.
  30                 private bool checkedRepeats;
  31
  32                 internal SloppyPhraseScorer(Weight weight, TermPositions[] tps, int[] offsets, Similarity similarity, int slop, byte[] norms):base(weight, tps, offsets, similarity, norms)
  33                 {
  34                         this.slop = slop;
  35                 }
  36
  37                 /// <summary> Score a candidate doc for all slop-valid position-combinations (matches)
  38                 /// encountered while traversing/hopping the PhrasePositions.
  39                 /// <br/> The score contribution of a match depends on the distance:
  40                 /// <br/> - highest score for distance=0 (exact match).
  41                 /// <br/> - score gets lower as distance gets higher.
  42                 /// <br/>Example: for query "a b"~2, a document "x a b a y" can be scored twice:
  43                 /// once for "a b" (distance=0), and once for "b a" (distance=2).
  44                 /// <br/>Possibly not all valid combinations are encountered, because for efficiency
  45                 /// we always propagate the least PhrasePosition. This allows to base on
  46                 /// PriorityQueue and move forward faster.
  47                 /// As result, for example, document "a b c b a"
  48                 /// would score differently for queries "a b c"~4 and "c b a"~4, although
  49                 /// they really are equivalent.
  50                 /// Similarly, for doc "a b c b a f g", query "c b"~2
  51                 /// would get same score as "g f"~2, although "c b"~2 could be matched twice.
  52                 /// We may want to fix this in the future (currently not, for performance reasons).
  53                 /// </summary>
  54                 protected internal override float PhraseFreq()
  55                 {
  56                         int end = InitPhrasePositions();
  57
  58                         float freq = 0.0f;
  59                         bool done = (end < 0);
  60                         while (!done)
  61                         {
  62                                 PhrasePositions pp = (PhrasePositions) pq.Pop();
  63                                 int start = pp.position;
  64                                 int next = ((PhrasePositions) pq.Top()).position;
  65
  66                                 bool tpsDiffer = true;
  67                                 for (int pos = start; pos <= next || !tpsDiffer; pos = pp.position)
  68                                 {
  69                                         if (pos <= next && tpsDiffer)
  70                                                 start = pos; // advance pp to min window
  71                                         if (!pp.NextPosition())
  72                                         {
  73                                                 done = true; // ran out of a term -- done
  74                                                 break;
  75                                         }
  76                                         PhrasePositions pp2 = null;
  77                                         tpsDiffer = !pp.repeats || (pp2 = TermPositionsDiffer(pp)) == null;
  78                                         if (pp2 != null && pp2 != pp)
  79                                         {
  80                                                 pp = Flip(pp, pp2); // flip pp to pp2
  81                                         }
  82                                 }
  83
  84                                 int matchLength = end - start;
  85                                 if (matchLength <= slop)
  86                                         freq += GetSimilarity().SloppyFreq(matchLength); // score match
  87
  88                                 if (pp.position > end)
  89                                         end = pp.position;
  90                                 pq.Put(pp); // restore pq
  91                         }
  92
  93                         return freq;
  94                 }
  95
  96                 // flip pp2 and pp in the queue: pop until finding pp2, insert back all but pp2, insert pp back.
  97                 // assumes: pp!=pp2, pp2 in pq, pp not in pq.
  98                 // called only when there are repeating pps.
  99                 private PhrasePositions Flip(PhrasePositions pp, PhrasePositions pp2)
 100                 {
 101                         int n = 0;
 102                         PhrasePositions pp3;
 103                         //pop until finding pp2
 104                         while ((pp3 = (PhrasePositions) pq.Pop()) != pp2)
 105                         {
 106                                 tmpPos[n++] = pp3;
 107                         }
 108                         //insert back all but pp2
 109                         for (n--; n >= 0; n--)
 110                         {
 111                                 pq.Insert(tmpPos[n]);
 112                         }
 113                         //insert pp back
 114                         pq.Put(pp);
 115                         return pp2;
 116                 }
 117
 118                 /// <summary> Init PhrasePositions in place.
 119                 /// There is a one time initialization for this scorer:
 120                 /// <br/>- Put in repeats[] each pp that has another pp with same position in the doc.
 121                 /// <br/>- Also mark each such pp by pp.repeats = true.
 122                 /// <br/>Later can consult with repeats[] in termPositionsDiffer(pp), making that check efficient.
 123                 /// In particular, this allows to score queries with no repetitions with no overhead due to this computation.
 124                 /// <br/>- Example 1 - query with no repetitions: "ho my"~2
 125                 /// <br/>- Example 2 - query with repetitions: "ho my my"~2
 126                 /// <br/>- Example 3 - query with repetitions: "my ho my"~2
 127                 /// <br/>Init per doc w/repeats in query, includes propagating some repeating pp's to avoid false phrase detection.
 128                 /// </summary>
 129                 /// <returns> end (max position), or -1 if any term ran out (i.e. done)
 130                 /// </returns>
 131                 /// <throws>  IOException  </throws>
 132                 private int InitPhrasePositions()
 133                 {
 134                         int end = 0;
 135
 136                         // no repeats at all (most common case is also the simplest one)
 137                         if (checkedRepeats && repeats == null)
 138                         {
 139                                 // build queue from list
 140                                 pq.Clear();
 141                                 for (PhrasePositions pp = first; pp != null; pp = pp.next)
 142                                 {
 143                                         pp.FirstPosition();
 144                                         if (pp.position > end)
 145                                                 end = pp.position;
 146                                         pq.Put(pp); // build pq from list
 147                                 }
 148                                 return end;
 149                         }
 150
 151                         // position the pp's
 152                         for (PhrasePositions pp = first; pp != null; pp = pp.next)
 153                                 pp.FirstPosition();
 154
 155                         // one time initializatin for this scorer
 156                         if (!checkedRepeats)
 157                         {
 158                                 checkedRepeats = true;
 159                                 // check for repeats
 160                                 System.Collections.Hashtable m = null;
 161                                 for (PhrasePositions pp = first; pp != null; pp = pp.next)
 162                                 {
 163                                         int tpPos = pp.position + pp.offset;
 164                                         for (PhrasePositions pp2 = pp.next; pp2 != null; pp2 = pp2.next)
 165                                         {
 166                                                 int tpPos2 = pp2.position + pp2.offset;
 167                                                 if (tpPos2 == tpPos)
 168                                                 {
 169                                                         if (m == null)
 170                                                         {
 171                                                                 m = new System.Collections.Hashtable();
 172                                                         }
 173                                                         pp.repeats = true;
 174                                                         pp2.repeats = true;
 175                                                         m[pp] = null;
 176                                                         m[pp2] = null;
 177                                                 }
 178                                         }
 179                                 }
 180                                 if (m != null)
 181                                 {
 182                                         repeats = (PhrasePositions[])(new System.Collections.ArrayList(m.Keys).ToArray(typeof(PhrasePositions)));
 183                                 }
 184                         }
 185
 186                         // with repeats must advance some repeating pp's so they all start with differing tp's
 187                         if (repeats != null)
 188                         {
 189                                 for (int i = 0; i < repeats.Length; i++)
 190                                 {
 191                                         PhrasePositions pp = repeats[i];
 192                                         PhrasePositions pp2;
 193                                         while ((pp2 = TermPositionsDiffer(pp)) != null)
 194                                         {
 195                                                 if (!pp2.NextPosition())
 196                                                 // out of pps that do not differ, advance the pp with higher offset
 197                                                         return - 1; // ran out of a term -- done
 198                                         }
 199                                 }
 200                         }
 201
 202                         // build queue from list
 203                         pq.Clear();
 204                         for (PhrasePositions pp = first; pp != null; pp = pp.next)
 205                         {
 206                                 if (pp.position > end)
 207                                         end = pp.position;
 208                                 pq.Put(pp); // build pq from list
 209                         }
 210
 211                         if (repeats != null)
 212                         {
 213                                 tmpPos = new PhrasePositions[pq.Size()];
 214                         }
 215                         return end;
 216                 }
 217
 218                 /// <summary> We disallow two pp's to have the same TermPosition, thereby verifying multiple occurrences
 219                 /// in the query of the same word would go elsewhere in the matched doc.
 220                 /// </summary>
 221                 /// <returns> null if differ (i.e. valid) otherwise return the higher offset PhrasePositions
 222                 /// out of the first two PPs found to not differ.
 223                 /// </returns>
 224                 private PhrasePositions TermPositionsDiffer(PhrasePositions pp)
 225                 {
 226                         // efficiency note: a more efficient implementation could keep a map between repeating
 227                         // pp's, so that if pp1a, pp1b, pp1c are repeats term1, and pp2a, pp2b are repeats
 228                         // of term2, pp2a would only be checked against pp2b but not against pp1a, pp1b, pp1c.
 229                         // However this would complicate code, for a rather rare case, so choice is to compromise here.
 230                         int tpPos = pp.position + pp.offset;
 231                         for (int i = 0; i < repeats.Length; i++)
 232                         {
 233                                 PhrasePositions pp2 = repeats[i];
 234                                 if (pp2 == pp)
 235                                         continue;
 236                                 int tpPos2 = pp2.position + pp2.offset;
 237                                 if (tpPos2 == tpPos)
 238                                         return pp.offset > pp2.offset?pp:pp2; // do not differ: return the one with higher offset.
 239                         }
 240                         return null;
 241                 }
 242         }
 243 }