2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
20 using TermPositions = Mono.Lucene.Net.Index.TermPositions;
22 namespace Mono.Lucene.Net.Search
25 sealed class SloppyPhraseScorer:PhraseScorer
28 private PhrasePositions[] repeats;
29 private PhrasePositions[] tmpPos; // for flipping repeating pps.
30 private bool checkedRepeats;
32 internal SloppyPhraseScorer(Weight weight, TermPositions[] tps, int[] offsets, Similarity similarity, int slop, byte[] norms):base(weight, tps, offsets, similarity, norms)
37 /// <summary> Score a candidate doc for all slop-valid position-combinations (matches)
38 /// encountered while traversing/hopping the PhrasePositions.
39 /// <br/> The score contribution of a match depends on the distance:
40 /// <br/> - highest score for distance=0 (exact match).
41 /// <br/> - score gets lower as distance gets higher.
42 /// <br/>Example: for query "a b"~2, a document "x a b a y" can be scored twice:
43 /// once for "a b" (distance=0), and once for "b a" (distance=2).
44 /// <br/>Possibly not all valid combinations are encountered, because for efficiency
45 /// we always propagate the least PhrasePosition. This allows to base on
46 /// PriorityQueue and move forward faster.
47 /// As result, for example, document "a b c b a"
48 /// would score differently for queries "a b c"~4 and "c b a"~4, although
49 /// they really are equivalent.
50 /// Similarly, for doc "a b c b a f g", query "c b"~2
51 /// would get same score as "g f"~2, although "c b"~2 could be matched twice.
52 /// We may want to fix this in the future (currently not, for performance reasons).
54 protected internal override float PhraseFreq()
56 int end = InitPhrasePositions();
59 bool done = (end < 0);
62 PhrasePositions pp = (PhrasePositions) pq.Pop();
63 int start = pp.position;
64 int next = ((PhrasePositions) pq.Top()).position;
66 bool tpsDiffer = true;
67 for (int pos = start; pos <= next || !tpsDiffer; pos = pp.position)
69 if (pos <= next && tpsDiffer)
70 start = pos; // advance pp to min window
71 if (!pp.NextPosition())
73 done = true; // ran out of a term -- done
76 PhrasePositions pp2 = null;
77 tpsDiffer = !pp.repeats || (pp2 = TermPositionsDiffer(pp)) == null;
78 if (pp2 != null && pp2 != pp)
80 pp = Flip(pp, pp2); // flip pp to pp2
84 int matchLength = end - start;
85 if (matchLength <= slop)
86 freq += GetSimilarity().SloppyFreq(matchLength); // score match
88 if (pp.position > end)
90 pq.Put(pp); // restore pq
96 // flip pp2 and pp in the queue: pop until finding pp2, insert back all but pp2, insert pp back.
97 // assumes: pp!=pp2, pp2 in pq, pp not in pq.
98 // called only when there are repeating pps.
99 private PhrasePositions Flip(PhrasePositions pp, PhrasePositions pp2)
103 //pop until finding pp2
104 while ((pp3 = (PhrasePositions) pq.Pop()) != pp2)
108 //insert back all but pp2
109 for (n--; n >= 0; n--)
111 pq.Insert(tmpPos[n]);
118 /// <summary> Init PhrasePositions in place.
119 /// There is a one time initialization for this scorer:
120 /// <br/>- Put in repeats[] each pp that has another pp with same position in the doc.
121 /// <br/>- Also mark each such pp by pp.repeats = true.
122 /// <br/>Later can consult with repeats[] in termPositionsDiffer(pp), making that check efficient.
123 /// In particular, this allows to score queries with no repetitions with no overhead due to this computation.
124 /// <br/>- Example 1 - query with no repetitions: "ho my"~2
125 /// <br/>- Example 2 - query with repetitions: "ho my my"~2
126 /// <br/>- Example 3 - query with repetitions: "my ho my"~2
127 /// <br/>Init per doc w/repeats in query, includes propagating some repeating pp's to avoid false phrase detection.
129 /// <returns> end (max position), or -1 if any term ran out (i.e. done)
131 /// <throws> IOException </throws>
132 private int InitPhrasePositions()
136 // no repeats at all (most common case is also the simplest one)
137 if (checkedRepeats && repeats == null)
139 // build queue from list
141 for (PhrasePositions pp = first; pp != null; pp = pp.next)
144 if (pp.position > end)
146 pq.Put(pp); // build pq from list
152 for (PhrasePositions pp = first; pp != null; pp = pp.next)
155 // one time initializatin for this scorer
158 checkedRepeats = true;
160 System.Collections.Hashtable m = null;
161 for (PhrasePositions pp = first; pp != null; pp = pp.next)
163 int tpPos = pp.position + pp.offset;
164 for (PhrasePositions pp2 = pp.next; pp2 != null; pp2 = pp2.next)
166 int tpPos2 = pp2.position + pp2.offset;
171 m = new System.Collections.Hashtable();
182 repeats = (PhrasePositions[])(new System.Collections.ArrayList(m.Keys).ToArray(typeof(PhrasePositions)));
186 // with repeats must advance some repeating pp's so they all start with differing tp's
189 for (int i = 0; i < repeats.Length; i++)
191 PhrasePositions pp = repeats[i];
193 while ((pp2 = TermPositionsDiffer(pp)) != null)
195 if (!pp2.NextPosition())
196 // out of pps that do not differ, advance the pp with higher offset
197 return - 1; // ran out of a term -- done
202 // build queue from list
204 for (PhrasePositions pp = first; pp != null; pp = pp.next)
206 if (pp.position > end)
208 pq.Put(pp); // build pq from list
213 tmpPos = new PhrasePositions[pq.Size()];
218 /// <summary> We disallow two pp's to have the same TermPosition, thereby verifying multiple occurrences
219 /// in the query of the same word would go elsewhere in the matched doc.
221 /// <returns> null if differ (i.e. valid) otherwise return the higher offset PhrasePositions
222 /// out of the first two PPs found to not differ.
224 private PhrasePositions TermPositionsDiffer(PhrasePositions pp)
226 // efficiency note: a more efficient implementation could keep a map between repeating
227 // pp's, so that if pp1a, pp1b, pp1c are repeats term1, and pp2a, pp2b are repeats
228 // of term2, pp2a would only be checked against pp2b but not against pp1a, pp1b, pp1c.
229 // However this would complicate code, for a rather rare case, so choice is to compromise here.
230 int tpPos = pp.position + pp.offset;
231 for (int i = 0; i < repeats.Length; i++)
233 PhrasePositions pp2 = repeats[i];
236 int tpPos2 = pp2.position + pp2.offset;
238 return pp.offset > pp2.offset?pp:pp2; // do not differ: return the one with higher offset.