Merge pull request #409 from Alkarex/patch-1
[mono.git] / mcs / tools / monkeydoc / Lucene.Net / Lucene.Net / Index / LogMergePolicy.cs
1 /* 
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements.  See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License.  You may obtain a copy of the License at
8  * 
9  * http://www.apache.org/licenses/LICENSE-2.0
10  * 
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17
18 using System;
19
20 namespace Mono.Lucene.Net.Index
21 {
22         
        /// <summary><p/>This class implements a <see cref="MergePolicy"/> that tries
        /// to merge segments into levels of exponentially
        /// increasing size, where each level has fewer segments than
        /// the value of the merge factor. Whenever extra segments
        /// (beyond the merge factor upper bound) are encountered,
        /// all segments within the level are merged. You can get or
        /// set the merge factor using <see cref="GetMergeFactor()"/> and
        /// <see cref="SetMergeFactor(int)"/> respectively.<p/>
        /// 
        /// <p/>This class is abstract and requires a subclass to
        /// define the <see cref="Size"/> method which specifies how a
        /// segment's size is determined.  <see cref="LogDocMergePolicy"/>
        /// is one subclass that measures size by document count in
        /// the segment.  <see cref="LogByteSizeMergePolicy"/> is another
        /// subclass that measures size as the total byte size of the
        /// file(s) for the segment.<p/>
        /// </summary>
40         
41         public abstract class LogMergePolicy:MergePolicy
42         {
        /// <summary>Defines the allowed range of log(size) for each
        /// level.  A level is computed by taking the max segment
        /// log size, minus LEVEL_LOG_SPAN, and finding all
        /// segments falling within that range. 
        /// </summary>
        public const double LEVEL_LOG_SPAN = 0.75;
        
        /// <summary>Default merge factor, which is how many segments are
        /// merged at a time. 
        /// </summary>
        public const int DEFAULT_MERGE_FACTOR = 10;
        
        /// <summary>Default maximum segment size: unbounded.</summary>
        /// <seealso cref="SetMaxMergeDocs">
        /// </seealso>
        public static readonly int DEFAULT_MAX_MERGE_DOCS = System.Int32.MaxValue;

        /// <summary>Default noCFSRatio.  If a merge's size is 10% or more of
        /// the index, then we disable compound file for it.
        /// </summary>
        /// <seealso cref="SetNoCFSRatio">
        /// </seealso>
        public static double DEFAULT_NO_CFS_RATIO = 0.1;
        
        // Number of segments merged at a time; see SetMergeFactor(int).
        private int mergeFactor = DEFAULT_MERGE_FACTOR;
        
        // Merge thresholds expressed in the unit chosen by the subclass's
        // Size(SegmentInfo) (documents or bytes).  Segments smaller than
        // minMergeSize share the lowest level; segments at or above
        // maxMergeSize / maxMergeDocs are never selected for merging.
        internal long minMergeSize;
        internal long maxMergeSize;
        internal int maxMergeDocs = DEFAULT_MAX_MERGE_DOCS;

        // Merges covering at least this fraction of the total index size
        // keep the non-compound file format; see SetNoCFSRatio(double).
        protected double noCFSRatio = DEFAULT_NO_CFS_RATIO;
        
        /* TODO 3.0: change this default to true */
        protected internal bool calibrateSizeByDeletes = false;
        
        // Whether newly merged segments (and doc stores) use compound files.
        private bool useCompoundFile = true;
        private bool useCompoundDocStore = true;
80                 
        /// <summary>Creates a policy bound to the given writer, which the
        /// policy consults for deletion counts and verbose logging. </summary>
        public LogMergePolicy(IndexWriter writer):base(writer)
        {
        }
84                 
85                 protected internal virtual bool Verbose()
86                 {
87                         return writer != null && writer.Verbose();
88                 }
89
90
        /// <summary>Returns the current noCFSRatio setting. </summary>
        /// <seealso cref="SetNoCFSRatio">
        /// </seealso>
        public double GetNoCFSRatio()
        {
            return noCFSRatio;
        }
96
97         /** If a merged segment will be more than this percentage
98          *  of the total size of the index, leave the segment as
99          *  non-compound file even if compound file is enabled.
100          *  Set to 1.0 to always use CFS regardless of merge
101          *  size. */
102         public void SetNoCFSRatio(double noCFSRatio)
103         {
104             if (noCFSRatio < 0.0 || noCFSRatio > 1.0)
105             {
106                 throw new ArgumentException("noCFSRatio must be 0.0 to 1.0 inclusive; got " + noCFSRatio);
107             }
108             this.noCFSRatio = noCFSRatio;
109         }
110                 
        /// <summary>Writes a diagnostic message to the writer's info stream,
        /// prefixed with "LMP: ", but only when verbose logging is on. </summary>
        private void  Message(System.String message)
        {
            if (Verbose())
                writer.Message("LMP: " + message);
        }
116                 
        /// <summary><p/>Returns the number of segments that are merged at
        /// once and also controls the total number of segments
        /// allowed to accumulate in the index.<p/> 
        /// </summary>
        public virtual int GetMergeFactor()
        {
            return mergeFactor;
        }
125                 
126                 /// <summary>Determines how often segment indices are merged by
127                 /// addDocument().  With smaller values, less RAM is used
128                 /// while indexing, and searches on unoptimized indices are
129                 /// faster, but indexing speed is slower.  With larger
130                 /// values, more RAM is used during indexing, and while
131                 /// searches on unoptimized indices are slower, indexing is
132         /// faster.  Thus larger values (&gt; 10) are best for batch
133         /// index creation, and smaller values (&lt; 10) for indices
134                 /// that are interactively maintained. 
135                 /// </summary>
136                 public virtual void  SetMergeFactor(int mergeFactor)
137                 {
138                         if (mergeFactor < 2)
139                                 throw new System.ArgumentException("mergeFactor cannot be less than 2");
140                         this.mergeFactor = mergeFactor;
141                 }
142                 
        /// <summary>Returns the current compound-file setting for newly merged
        /// segments; this policy ignores the per-segment arguments. </summary>
        public override bool UseCompoundFile(SegmentInfos infos, SegmentInfo info)
        {
            return useCompoundFile;
        }
148                 
        /// <summary>Sets whether compound file format should be used for
        /// newly flushed and newly merged segments. 
        /// </summary>
        public virtual void  SetUseCompoundFile(bool useCompoundFile)
        {
            this.useCompoundFile = useCompoundFile;
        }
156                 
        /// <summary>Returns true if newly flushed and newly merged segments
        /// are set to use the compound file format. </summary>
        /// <seealso cref="SetUseCompoundFile">
        /// </seealso>
        public virtual bool GetUseCompoundFile()
        {
            return useCompoundFile;
        }
164                 
        /// <summary>Returns the current compound-file setting for newly
        /// flushed doc store segment files; the argument is ignored. </summary>
        public override bool UseCompoundDocStore(SegmentInfos infos)
        {
            return useCompoundDocStore;
        }
170                 
        /// <summary>Sets whether compound file format should be used for
        /// newly flushed and newly merged doc store
        /// segment files (term vectors and stored fields). 
        /// </summary>
        public virtual void  SetUseCompoundDocStore(bool useCompoundDocStore)
        {
            this.useCompoundDocStore = useCompoundDocStore;
        }
179                 
        /// <summary>Returns true if newly flushed and newly merged doc
        /// store segment files (term vectors and stored fields)
        /// are set to use the compound file format. 
        /// </summary>
        /// <seealso cref="SetUseCompoundDocStore">
        /// </seealso>
        public virtual bool GetUseCompoundDocStore()
        {
            return useCompoundDocStore;
        }
189                 
        /// <summary>Sets whether the segment size should be calibrated by
        /// the number of deletes when choosing segments for merge. 
        /// </summary>
        public virtual void  SetCalibrateSizeByDeletes(bool calibrateSizeByDeletes)
        {
            this.calibrateSizeByDeletes = calibrateSizeByDeletes;
        }
197                 
        /// <summary>Returns true if the segment size should be calibrated 
        /// by the number of deletes when choosing segments for merge. 
        /// </summary>
        public virtual bool GetCalibrateSizeByDeletes()
        {
            return calibrateSizeByDeletes;
        }
205                 
        /// <summary>Releases resources held by the policy; this
        /// implementation holds none, so the method is a no-op. </summary>
        public override void  Close()
        {
        }
209                 
        /// <summary>Returns the size of the given segment, in the unit
        /// this policy levels by (documents or bytes); implemented
        /// by subclasses such as LogDocMergePolicy and
        /// LogByteSizeMergePolicy. </summary>
        abstract protected internal long Size(SegmentInfo info);
212                 protected internal virtual long SizeDocs(SegmentInfo info)
213                 {
214                         if (calibrateSizeByDeletes)
215                         {
216                                 int delCount = writer.NumDeletedDocs(info);
217                                 return (info.docCount - (long) delCount);
218                         }
219                         else
220                         {
221                                 return info.docCount;
222                         }
223                 }
224                 
225                 protected internal virtual long SizeBytes(SegmentInfo info)
226                 {
227                         long byteSize = info.SizeInBytes();
228                         if (calibrateSizeByDeletes)
229                         {
230                                 int delCount = writer.NumDeletedDocs(info);
231                                 float delRatio = (info.docCount <= 0?0.0f:((float) delCount / (float) info.docCount));
232                                 return (info.docCount <= 0?byteSize:(long) (byteSize * (1.0f - delRatio)));
233                         }
234                         else
235                         {
236                                 return byteSize;
237                         }
238                 }
239                 
        /// <summary>Returns true if the index is already optimized with respect
        /// to maxNumSegments: no more than maxNumSegments of the candidate
        /// segments remain and, when exactly one remains, that segment
        /// itself is optimized (see IsOptimized(SegmentInfo)). </summary>
        private bool IsOptimized(SegmentInfos infos, int maxNumSegments, System.Collections.Hashtable segmentsToOptimize)
        {
            int numSegments = infos.Count;
            int numToOptimize = 0;
            SegmentInfo optimizeInfo = null;
            // Stop counting as soon as we have seen more candidates than
            // maxNumSegments -- the exact count beyond that doesn't matter.
            for (int i = 0; i < numSegments && numToOptimize <= maxNumSegments; i++)
            {
                SegmentInfo info = infos.Info(i);
                if (segmentsToOptimize.Contains(info))
                {
                    numToOptimize++;
                    optimizeInfo = info;
                }
            }
            
            return numToOptimize <= maxNumSegments && (numToOptimize != 1 || IsOptimized(optimizeInfo));
        }
257                 
        /// <summary>Returns true if this single info is optimized (has no
        /// pending norms or deletes, is in the same dir as the
        /// writer, and matches the current compound file setting). 
        /// </summary>
        private bool IsOptimized(SegmentInfo info)
        {
            bool hasDeletions = writer.NumDeletedDocs(info) > 0;
            // A CFS mismatch is tolerated when noCFSRatio < 1.0, since large
            // merges may legitimately stay non-compound in that mode.
            return !hasDeletions && !info.HasSeparateNorms() && info.dir == writer.GetDirectory() &&
                (info.GetUseCompoundFile() == useCompoundFile || noCFSRatio < 1.0);
        }
268                 
        /// <summary>Returns the merges necessary to optimize the index.
        /// This merge policy defines "optimized" to mean only one
        /// segment in the index, where that segment has no
        /// deletions pending nor separate norms, and it is in
        /// compound file format if the current useCompoundFile
        /// setting is true.  This method returns multiple merges
        /// (mergeFactor at a time) so the <see cref="MergeScheduler"/>
        /// in use may make use of concurrency. 
        /// </summary>
        public override MergeSpecification FindMergesForOptimize(SegmentInfos infos, int maxNumSegments, System.Collections.Hashtable segmentsToOptimize)
        {
            MergeSpecification spec;
            
            System.Diagnostics.Debug.Assert(maxNumSegments > 0);
            
            if (!IsOptimized(infos, maxNumSegments, segmentsToOptimize))
            {
                
                // Find the newest (rightmost) segment that needs to
                // be optimized (other segments may have been flushed
                // since optimize started):
                int last = infos.Count;
                while (last > 0)
                {
                    SegmentInfo info = infos.Info(--last);
                    if (segmentsToOptimize.Contains(info))
                    {
                        last++;
                        break;
                    }
                }
                
                if (last > 0)
                {
                    
                    spec = new MergeSpecification();
                    
                    // First, enroll all "full" merges (size
                    // mergeFactor) to potentially be run concurrently:
                    while (last - maxNumSegments + 1 >= mergeFactor)
                    {
                        spec.Add(MakeOneMerge(infos, infos.Range(last - mergeFactor, last)));
                        last -= mergeFactor;
                    }
                    
                    // Only if there are no full merges pending do we
                    // add a final partial (< mergeFactor segments) merge:
                    if (0 == spec.merges.Count)
                    {
                        if (maxNumSegments == 1)
                        {
                            
                            // Since we must optimize down to 1 segment, the
                            // choice is simple:
                            if (last > 1 || !IsOptimized(infos.Info(0)))
                                spec.Add(MakeOneMerge(infos, infos.Range(0, last)));
                        }
                        else if (last > maxNumSegments)
                        {
                            
                            // Take care to pick a partial merge that is
                            // least cost, but does not make the index too
                            // lopsided.  If we always just picked the
                            // partial tail then we could produce a highly
                            // lopsided index over time:
                            
                            // We must merge this many segments to leave
                            // maxNumSegments in the index (from when
                            // optimize was first kicked off):
                            int finalMergeSize = last - maxNumSegments + 1;
                            
                            // Consider all possible starting points:
                            long bestSize = 0;
                            int bestStart = 0;
                            
                            for (int i = 0; i < last - finalMergeSize + 1; i++)
                            {
                                long sumSize = 0;
                                for (int j = 0; j < finalMergeSize; j++)
                                    sumSize += Size(infos.Info(j + i));
                                // Prefer the cheapest window, but never one more
                                // than twice the size of the segment just before
                                // it (avoids lopsided merges):
                                if (i == 0 || (sumSize < 2 * Size(infos.Info(i - 1)) && sumSize < bestSize))
                                {
                                    bestStart = i;
                                    bestSize = sumSize;
                                }
                            }

                            spec.Add(MakeOneMerge(infos, infos.Range(bestStart, bestStart + finalMergeSize)));
                        }
                    }
                }
                else
                    // No candidate segments remain; nothing to merge.
                    spec = null;
            }
            else
                // Index already satisfies the optimize criteria.
                spec = null;
            
            return spec;
        }
368                 
        /// <summary> Finds merges necessary to expunge all deletes from the
        /// index.  We simply merge adjacent segments that have
        /// deletes, up to mergeFactor at a time.
        /// </summary>
        public override MergeSpecification FindMergesToExpungeDeletes(SegmentInfos segmentInfos)
        {
            int numSegments = segmentInfos.Count;
            
            if (Verbose())
                Message("findMergesToExpungeDeletes: " + numSegments + " segments");
            
            MergeSpecification spec = new MergeSpecification();
            // Index of the first segment in the current run of adjacent
            // segments with deletions, or -1 when not inside such a run.
            int firstSegmentWithDeletions = - 1;
            for (int i = 0; i < numSegments; i++)
            {
                SegmentInfo info = segmentInfos.Info(i);
                int delCount = writer.NumDeletedDocs(info);
                if (delCount > 0)
                {
                    if (Verbose())
                        Message("  segment " + info.name + " has deletions");
                    if (firstSegmentWithDeletions == - 1)
                        firstSegmentWithDeletions = i;
                    else if (i - firstSegmentWithDeletions == mergeFactor)
                    {
                        // We've seen mergeFactor segments in a row with
                        // deletions, so force a merge now:
                        if (Verbose())
                            Message("  add merge " + firstSegmentWithDeletions + " to " + (i - 1) + " inclusive");
                        spec.Add(MakeOneMerge(segmentInfos, segmentInfos.Range(firstSegmentWithDeletions, i)));
                        firstSegmentWithDeletions = i;
                    }
                }
                else if (firstSegmentWithDeletions != - 1)
                {
                    // End of a sequence of segments with deletions, so,
                    // merge those past segments even if it's fewer than
                    // mergeFactor segments
                    if (Verbose())
                        Message("  add merge " + firstSegmentWithDeletions + " to " + (i - 1) + " inclusive");
                    spec.Add(MakeOneMerge(segmentInfos, segmentInfos.Range(firstSegmentWithDeletions, i)));
                    firstSegmentWithDeletions = - 1;
                }
            }
            
            // Flush a trailing run that extends to the end of the index.
            if (firstSegmentWithDeletions != - 1)
            {
                if (Verbose())
                    Message("  add merge " + firstSegmentWithDeletions + " to " + (numSegments - 1) + " inclusive");
                spec.Add(MakeOneMerge(segmentInfos, segmentInfos.Range(firstSegmentWithDeletions, numSegments)));
            }
            
            return spec;
        }
423                 
        /// <summary>Checks if any merges are now necessary and returns a
        /// <see cref="MergePolicy.MergeSpecification"/> if so.  A merge
        /// is necessary when there are more than
        /// <see cref="SetMergeFactor"/> segments at a given level.  When
        /// multiple levels have too many segments, this method
        /// will return multiple merges, allowing the
        /// <see cref="MergeScheduler"/> to use concurrency. 
        /// </summary>
        public override MergeSpecification FindMerges(SegmentInfos infos)
        {
            
            int numSegments = infos.Count;
            if (Verbose())
                Message("findMerges: " + numSegments + " segments");
            
            // Compute levels, which is just log (base mergeFactor)
            // of the size of each segment
            float[] levels = new float[numSegments];
            float norm = (float) System.Math.Log(mergeFactor);
            
            for (int i = 0; i < numSegments; i++)
            {
                SegmentInfo info = infos.Info(i);
                long size = Size(info);
                
                // Floor tiny segments
                if (size < 1)
                    size = 1;
                levels[i] = (float) System.Math.Log(size) / norm;
            }
            
            // Segments whose level falls below levelFloor are all treated
            // as belonging to the lowest level (derived from minMergeSize).
            float levelFloor;
            if (minMergeSize <= 0)
                levelFloor = (float) 0.0;
            else
            {
                levelFloor = (float) (System.Math.Log(minMergeSize) / norm);
            }
            
            // Now, we quantize the log values into levels.  The
            // first level is any segment whose log size is within
            // LEVEL_LOG_SPAN of the max size, or, who has such as
            // segment "to the right".  Then, we find the max of all
            // other segments and use that to define the next level
            // segment, etc.
            
            MergeSpecification spec = null;
            
            int start = 0;
            while (start < numSegments)
            {
                
                // Find max level of all segments not already
                // quantized.
                float maxLevel = levels[start];
                for (int i = 1 + start; i < numSegments; i++)
                {
                    float level = levels[i];
                    if (level > maxLevel)
                        maxLevel = level;
                }
                
                // Now search backwards for the rightmost segment that
                // falls into this level:
                float levelBottom;
                if (maxLevel < levelFloor)
                // All remaining segments fall into the min level
                    levelBottom = - 1.0F;
                else
                {
                    levelBottom = (float) (maxLevel - LEVEL_LOG_SPAN);
                    
                    // Force a boundary at the level floor
                    if (levelBottom < levelFloor && maxLevel >= levelFloor)
                        levelBottom = levelFloor;
                }
                
                int upto = numSegments - 1;
                while (upto >= start)
                {
                    if (levels[upto] >= levelBottom)
                    {
                        break;
                    }
                    upto--;
                }
                if (Verbose())
                    Message("  level " + levelBottom + " to " + maxLevel + ": " + (1 + upto - start) + " segments");
                
                // Finally, record all merges that are viable at this level:
                int end = start + mergeFactor;
                while (end <= 1 + upto)
                {
                    // Skip windows containing any segment already at or over
                    // the maxMergeSize / maxMergeDocs thresholds.
                    bool anyTooLarge = false;
                    for (int i = start; i < end; i++)
                    {
                        SegmentInfo info = infos.Info(i);
                        anyTooLarge |= (Size(info) >= maxMergeSize || SizeDocs(info) >= maxMergeDocs);
                    }
                    
                    if (!anyTooLarge)
                    {
                        if (spec == null)
                            spec = new MergeSpecification();
                        if (Verbose())
                            Message("    " + start + " to " + end + ": add this merge");
                        spec.Add(MakeOneMerge(infos, infos.Range(start, end)));
                    }
                    else if (Verbose())
                        Message("    " + start + " to " + end + ": contains segment over maxMergeSize or maxMergeDocs; skipping");
                    
                    start = end;
                    end = start + mergeFactor;
                }
                
                start = 1 + upto;
            }
            
            // Null means no merges are currently necessary.
            return spec;
        }
544         
545         protected OneMerge MakeOneMerge(SegmentInfos infos, SegmentInfos infosToMerge)
546         {
547             bool doCFS;
548             if (!useCompoundFile)
549             {
550                 doCFS = false;
551             }
552             else if (noCFSRatio == 1.0)
553             {
554                 doCFS = true;
555             }
556             else
557             {
558                 long totSize = 0;
559                 for (int i = 0; i < infos.Count; i++)
560                 {
561                     totSize += Size(infos.Info(i));
562                 }
563                 long mergeSize = 0;
564                 for (int i = 0; i < infosToMerge.Count; i++)
565                 {
566                     mergeSize += Size(infosToMerge.Info(i));
567                 }
568
569                 doCFS = mergeSize <= noCFSRatio * totSize;
570             }
571
572             return new OneMerge(infosToMerge, doCFS);
573         }
574                 
        /// <summary><p/>Determines the largest segment (measured by
        /// document count) that may be merged with other segments.
        /// Small values (e.g., less than 10,000) are best for
        /// interactive indexing, as this limits the length of
        /// pauses while indexing to a few seconds.  Larger values
        /// are best for batched indexing and speedier
        /// searches.<p/>
        /// 
        /// <p/>The default value is <see cref="int.MaxValue"/>.<p/>
        /// 
        /// <p/>The default merge policy (<see cref="LogByteSizeMergePolicy"/>)
        /// also allows you to set this
        /// limit by net size (in MB) of the segment, using
        /// <see cref="LogByteSizeMergePolicy.SetMaxMergeMB"/>.<p/>
        /// </summary>
        public virtual void  SetMaxMergeDocs(int maxMergeDocs)
        {
            this.maxMergeDocs = maxMergeDocs;
        }
594                 
        /// <summary>Returns the largest segment (measured by document
        /// count) that may be merged with other segments.
        /// </summary>
        /// <seealso cref="SetMaxMergeDocs">
        /// </seealso>
        public virtual int GetMaxMergeDocs()
        {
            return maxMergeDocs;
        }
604         }
605 }