Merge pull request #409 from Alkarex/patch-1
[mono.git] / mcs / tools / monkeydoc / Lucene.Net / Lucene.Net / Search / SloppyPhraseScorer.cs
1 /* 
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements.  See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License.  You may obtain a copy of the License at
8  * 
9  * http://www.apache.org/licenses/LICENSE-2.0
10  * 
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17
18 using System;
19
20 using TermPositions = Mono.Lucene.Net.Index.TermPositions;
21
22 namespace Mono.Lucene.Net.Search
23 {
24         
25         sealed class SloppyPhraseScorer:PhraseScorer
26         {
27                 private int slop;
28                 private PhrasePositions[] repeats;
29                 private PhrasePositions[] tmpPos; // for flipping repeating pps.
30                 private bool checkedRepeats;
31                 
32                 internal SloppyPhraseScorer(Weight weight, TermPositions[] tps, int[] offsets, Similarity similarity, int slop, byte[] norms):base(weight, tps, offsets, similarity, norms)
33                 {
34                         this.slop = slop;
35                 }
36                 
37                 /// <summary> Score a candidate doc for all slop-valid position-combinations (matches) 
38                 /// encountered while traversing/hopping the PhrasePositions.
39                 /// <br/> The score contribution of a match depends on the distance: 
40                 /// <br/> - highest score for distance=0 (exact match).
41                 /// <br/> - score gets lower as distance gets higher.
42                 /// <br/>Example: for query "a b"~2, a document "x a b a y" can be scored twice: 
43                 /// once for "a b" (distance=0), and once for "b a" (distance=2).
44                 /// <br/>Possibly not all valid combinations are encountered, because for efficiency  
45                 /// we always propagate the least PhrasePosition. This allows to base on 
46                 /// PriorityQueue and move forward faster. 
47                 /// As result, for example, document "a b c b a"
48                 /// would score differently for queries "a b c"~4 and "c b a"~4, although 
49                 /// they really are equivalent. 
50                 /// Similarly, for doc "a b c b a f g", query "c b"~2 
51                 /// would get same score as "g f"~2, although "c b"~2 could be matched twice.
52                 /// We may want to fix this in the future (currently not, for performance reasons).
53                 /// </summary>
54                 protected internal override float PhraseFreq()
55                 {
56                         int end = InitPhrasePositions();
57                         
58                         float freq = 0.0f;
59                         bool done = (end < 0);
60                         while (!done)
61                         {
62                                 PhrasePositions pp = (PhrasePositions) pq.Pop();
63                                 int start = pp.position;
64                                 int next = ((PhrasePositions) pq.Top()).position;
65                                 
66                                 bool tpsDiffer = true;
67                                 for (int pos = start; pos <= next || !tpsDiffer; pos = pp.position)
68                                 {
69                                         if (pos <= next && tpsDiffer)
70                                                 start = pos; // advance pp to min window
71                                         if (!pp.NextPosition())
72                                         {
73                                                 done = true; // ran out of a term -- done
74                                                 break;
75                                         }
76                                         PhrasePositions pp2 = null;
77                                         tpsDiffer = !pp.repeats || (pp2 = TermPositionsDiffer(pp)) == null;
78                                         if (pp2 != null && pp2 != pp)
79                                         {
80                                                 pp = Flip(pp, pp2); // flip pp to pp2
81                                         }
82                                 }
83                                 
84                                 int matchLength = end - start;
85                                 if (matchLength <= slop)
86                                         freq += GetSimilarity().SloppyFreq(matchLength); // score match
87                                 
88                                 if (pp.position > end)
89                                         end = pp.position;
90                                 pq.Put(pp); // restore pq
91                         }
92                         
93                         return freq;
94                 }
95                 
96                 // flip pp2 and pp in the queue: pop until finding pp2, insert back all but pp2, insert pp back.
97                 // assumes: pp!=pp2, pp2 in pq, pp not in pq.
98                 // called only when there are repeating pps.
99                 private PhrasePositions Flip(PhrasePositions pp, PhrasePositions pp2)
100                 {
101                         int n = 0;
102                         PhrasePositions pp3;
103                         //pop until finding pp2
104                         while ((pp3 = (PhrasePositions) pq.Pop()) != pp2)
105                         {
106                                 tmpPos[n++] = pp3;
107                         }
108                         //insert back all but pp2
109                         for (n--; n >= 0; n--)
110                         {
111                                 pq.Insert(tmpPos[n]);
112                         }
113                         //insert pp back
114                         pq.Put(pp);
115                         return pp2;
116                 }
117                 
118                 /// <summary> Init PhrasePositions in place.
119                 /// There is a one time initialization for this scorer:
120                 /// <br/>- Put in repeats[] each pp that has another pp with same position in the doc.
121                 /// <br/>- Also mark each such pp by pp.repeats = true.
122                 /// <br/>Later can consult with repeats[] in termPositionsDiffer(pp), making that check efficient.
123                 /// In particular, this allows to score queries with no repetitions with no overhead due to this computation.
124                 /// <br/>- Example 1 - query with no repetitions: "ho my"~2
125                 /// <br/>- Example 2 - query with repetitions: "ho my my"~2
126                 /// <br/>- Example 3 - query with repetitions: "my ho my"~2
127                 /// <br/>Init per doc w/repeats in query, includes propagating some repeating pp's to avoid false phrase detection.  
128                 /// </summary>
129                 /// <returns> end (max position), or -1 if any term ran out (i.e. done) 
130                 /// </returns>
131                 /// <throws>  IOException  </throws>
132                 private int InitPhrasePositions()
133                 {
134                         int end = 0;
135                         
136                         // no repeats at all (most common case is also the simplest one)
137                         if (checkedRepeats && repeats == null)
138                         {
139                                 // build queue from list
140                                 pq.Clear();
141                                 for (PhrasePositions pp = first; pp != null; pp = pp.next)
142                                 {
143                                         pp.FirstPosition();
144                                         if (pp.position > end)
145                                                 end = pp.position;
146                                         pq.Put(pp); // build pq from list
147                                 }
148                                 return end;
149                         }
150                         
151                         // position the pp's
152                         for (PhrasePositions pp = first; pp != null; pp = pp.next)
153                                 pp.FirstPosition();
154                         
155                         // one time initializatin for this scorer
156                         if (!checkedRepeats)
157                         {
158                                 checkedRepeats = true;
159                                 // check for repeats
160                                 System.Collections.Hashtable m = null;
161                                 for (PhrasePositions pp = first; pp != null; pp = pp.next)
162                                 {
163                                         int tpPos = pp.position + pp.offset;
164                                         for (PhrasePositions pp2 = pp.next; pp2 != null; pp2 = pp2.next)
165                                         {
166                                                 int tpPos2 = pp2.position + pp2.offset;
167                                                 if (tpPos2 == tpPos)
168                                                 {
169                                                         if (m == null)
170                                                         {
171                                                                 m = new System.Collections.Hashtable();
172                                                         }
173                                                         pp.repeats = true;
174                                                         pp2.repeats = true;
175                                                         m[pp] = null;
176                                                         m[pp2] = null;
177                                                 }
178                                         }
179                                 }
180                                 if (m != null)
181                                 {
182                                         repeats = (PhrasePositions[])(new System.Collections.ArrayList(m.Keys).ToArray(typeof(PhrasePositions)));
183                                 }
184                         }
185                         
186                         // with repeats must advance some repeating pp's so they all start with differing tp's       
187                         if (repeats != null)
188                         {
189                                 for (int i = 0; i < repeats.Length; i++)
190                                 {
191                                         PhrasePositions pp = repeats[i];
192                                         PhrasePositions pp2;
193                                         while ((pp2 = TermPositionsDiffer(pp)) != null)
194                                         {
195                                                 if (!pp2.NextPosition())
196                                                 // out of pps that do not differ, advance the pp with higher offset 
197                                                         return - 1; // ran out of a term -- done  
198                                         }
199                                 }
200                         }
201                         
202                         // build queue from list
203                         pq.Clear();
204                         for (PhrasePositions pp = first; pp != null; pp = pp.next)
205                         {
206                                 if (pp.position > end)
207                                         end = pp.position;
208                                 pq.Put(pp); // build pq from list
209                         }
210                         
211                         if (repeats != null)
212                         {
213                                 tmpPos = new PhrasePositions[pq.Size()];
214                         }
215                         return end;
216                 }
217                 
218                 /// <summary> We disallow two pp's to have the same TermPosition, thereby verifying multiple occurrences 
219                 /// in the query of the same word would go elsewhere in the matched doc.
220                 /// </summary>
221                 /// <returns> null if differ (i.e. valid) otherwise return the higher offset PhrasePositions
222                 /// out of the first two PPs found to not differ.
223                 /// </returns>
224                 private PhrasePositions TermPositionsDiffer(PhrasePositions pp)
225                 {
226                         // efficiency note: a more efficient implementation could keep a map between repeating 
227                         // pp's, so that if pp1a, pp1b, pp1c are repeats term1, and pp2a, pp2b are repeats 
228                         // of term2, pp2a would only be checked against pp2b but not against pp1a, pp1b, pp1c. 
229                         // However this would complicate code, for a rather rare case, so choice is to compromise here.
230                         int tpPos = pp.position + pp.offset;
231                         for (int i = 0; i < repeats.Length; i++)
232                         {
233                                 PhrasePositions pp2 = repeats[i];
234                                 if (pp2 == pp)
235                                         continue;
236                                 int tpPos2 = pp2.position + pp2.offset;
237                                 if (tpPos2 == tpPos)
238                                         return pp.offset > pp2.offset?pp:pp2; // do not differ: return the one with higher offset.
239                         }
240                         return null;
241                 }
242         }
243 }