/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
namespace Mono.Lucene.Net.Index
{
    /// <summary><p/>This class implements a {@link MergePolicy} that tries
    /// to merge segments into levels of exponentially
    /// increasing size, where each level has fewer segments than
    /// the value of the merge factor. Whenever extra segments
    /// (beyond the merge factor upper bound) are encountered,
    /// all segments within the level are merged. You can get or
    /// set the merge factor using {@link #GetMergeFactor()} and
    /// {@link #SetMergeFactor(int)} respectively.<p/>
    ///
    /// <p/>This class is abstract and requires a subclass to
    /// define the {@link #Size} method which specifies how a
    /// segment's size is determined. {@link LogDocMergePolicy}
    /// is one subclass that measures size by document count in
    /// the segment. {@link LogByteSizeMergePolicy} is another
    /// subclass that measures size as the total byte size of the
    /// file(s) for the segment.<p/>
    /// </summary>
    public abstract class LogMergePolicy : MergePolicy
    {
        /// <summary>Defines the allowed range of log(size) for each
        /// level. A level is computed by taking the max segment
        /// log size, minus LEVEL_LOG_SPAN, and finding all
        /// segments falling within that range.
        /// </summary>
        public const double LEVEL_LOG_SPAN = 0.75;
        /// <summary>Default merge factor, which is how many segments are
        /// merged at a time.
        /// </summary>
        public const int DEFAULT_MERGE_FACTOR = 10;
        /// <summary>Default maximum segment size. A segment of this size
        /// or larger will never be merged.
        /// </summary>
        /// <seealso cref="SetMaxMergeDocs">
        /// </seealso>
        public static readonly int DEFAULT_MAX_MERGE_DOCS = System.Int32.MaxValue;
        /// <summary>Default noCFSRatio. If a merge's size is >= 10% of
        /// the index, then we disable compound file for it.
        /// </summary>
        /// <seealso cref="SetNoCFSRatio">
        /// </seealso>
        public static double DEFAULT_NO_CFS_RATIO = 0.1;
        private int mergeFactor = DEFAULT_MERGE_FACTOR;

        internal long minMergeSize;
        internal long maxMergeSize;
        internal int maxMergeDocs = DEFAULT_MAX_MERGE_DOCS;

        protected double noCFSRatio = DEFAULT_NO_CFS_RATIO;

        /* TODO 3.0: change this default to true */
        protected internal bool calibrateSizeByDeletes = false;

        private bool useCompoundFile = true;
        private bool useCompoundDocStore = true;
        public LogMergePolicy(IndexWriter writer) : base(writer)
        {
        }
        protected internal virtual bool Verbose()
        {
            return writer != null && writer.Verbose();
        }
        /// <seealso cref="SetNoCFSRatio">
        /// </seealso>
        public double GetNoCFSRatio()
        {
            return noCFSRatio;
        }
        /// <summary>If a merged segment will be more than this percentage
        /// of the total size of the index, leave the segment as
        /// non-compound file even if compound file is enabled.
        /// Set to 1.0 to always use CFS regardless of merge size.
        /// </summary>
        public void SetNoCFSRatio(double noCFSRatio)
        {
            if (noCFSRatio < 0.0 || noCFSRatio > 1.0)
            {
                throw new System.ArgumentException("noCFSRatio must be 0.0 to 1.0 inclusive; got " + noCFSRatio);
            }
            this.noCFSRatio = noCFSRatio;
        }
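        // A minimal sketch of how the ratio plays out (illustrative numbers,
        // not from the source); MakeOneMerge below performs the equivalent
        // comparison:
        //
        //   double ratio = 0.1;           // DEFAULT_NO_CFS_RATIO
        //   long totSize = 100L << 20;    // 100 MB total index size
        //   long mergeSize = 8L << 20;    // 8 MB merged segment
        //   bool doCFS = mergeSize <= ratio * totSize;  // true: keep compound
        //   // a 15 MB merge would yield false: left as individual files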
        private void Message(System.String message)
        {
            if (Verbose())
                writer.Message("LMP: " + message);
        }
        /// <summary><p/>Returns the number of segments that are merged at
        /// once and also controls the total number of segments
        /// allowed to accumulate in the index.<p/>
        /// </summary>
        public virtual int GetMergeFactor()
        {
            return mergeFactor;
        }
        /// <summary>Determines how often segment indices are merged by
        /// addDocument(). With smaller values, less RAM is used
        /// while indexing, and searches on unoptimized indices are
        /// faster, but indexing speed is slower. With larger
        /// values, more RAM is used during indexing, and while
        /// searches on unoptimized indices are slower, indexing is
        /// faster. Thus larger values (> 10) are best for batch
        /// index creation, and smaller values (< 10) for indices
        /// that are interactively maintained.
        /// </summary>
        public virtual void SetMergeFactor(int mergeFactor)
        {
            if (mergeFactor < 2)
                throw new System.ArgumentException("mergeFactor cannot be less than 2");
            this.mergeFactor = mergeFactor;
        }
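        // A minimal usage sketch (assumes an existing IndexWriter named
        // "writer"; LogDocMergePolicy is the document-count subclass named in
        // the class summary, and IndexWriter.SetMergePolicy is assumed here):
        //
        //   LogDocMergePolicy policy = new LogDocMergePolicy(writer);
        //   policy.SetMergeFactor(20);    // batch indexing: fewer, larger merges
        //   writer.SetMergePolicy(policy);
        //
        // A lower factor (e.g. 5) would suit interactively maintained indices.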
        public override bool UseCompoundFile(SegmentInfos infos, SegmentInfo info)
        {
            return useCompoundFile;
        }
        /// <summary>Sets whether compound file format should be used for
        /// newly flushed and newly merged segments.
        /// </summary>
        public virtual void SetUseCompoundFile(bool useCompoundFile)
        {
            this.useCompoundFile = useCompoundFile;
        }
        /// <summary>Returns true if newly flushed and newly merged segments
        /// are written in compound file format.
        /// </summary>
        /// <seealso cref="SetUseCompoundFile">
        /// </seealso>
        public virtual bool GetUseCompoundFile()
        {
            return useCompoundFile;
        }
        public override bool UseCompoundDocStore(SegmentInfos infos)
        {
            return useCompoundDocStore;
        }
        /// <summary>Sets whether compound file format should be used for
        /// newly flushed and newly merged doc store
        /// segment files (term vectors and stored fields).
        /// </summary>
        public virtual void SetUseCompoundDocStore(bool useCompoundDocStore)
        {
            this.useCompoundDocStore = useCompoundDocStore;
        }
        /// <summary>Returns true if newly flushed and newly merged doc
        /// store segment files (term vectors and stored fields)
        /// are written in compound file format.
        /// </summary>
        /// <seealso cref="SetUseCompoundDocStore">
        /// </seealso>
        public virtual bool GetUseCompoundDocStore()
        {
            return useCompoundDocStore;
        }
        /// <summary>Sets whether the segment size should be calibrated by
        /// the number of deletes when choosing segments for merge.
        /// </summary>
        public virtual void SetCalibrateSizeByDeletes(bool calibrateSizeByDeletes)
        {
            this.calibrateSizeByDeletes = calibrateSizeByDeletes;
        }
        /// <summary>Returns true if the segment size should be calibrated
        /// by the number of deletes when choosing segments for merge.
        /// </summary>
        public virtual bool GetCalibrateSizeByDeletes()
        {
            return calibrateSizeByDeletes;
        }
        public override void Close()
        {
        }
        abstract protected internal long Size(SegmentInfo info);
        protected internal virtual long SizeDocs(SegmentInfo info)
        {
            if (calibrateSizeByDeletes)
            {
                int delCount = writer.NumDeletedDocs(info);
                return (info.docCount - (long) delCount);
            }
            else
            {
                return info.docCount;
            }
        }
        protected internal virtual long SizeBytes(SegmentInfo info)
        {
            long byteSize = info.SizeInBytes();
            if (calibrateSizeByDeletes)
            {
                int delCount = writer.NumDeletedDocs(info);
                float delRatio = (info.docCount <= 0 ? 0.0f : ((float) delCount / (float) info.docCount));
                return (info.docCount <= 0 ? byteSize : (long) (byteSize * (1.0f - delRatio)));
            }
            else
            {
                return byteSize;
            }
        }
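        // Worked example of the calibration above (illustrative numbers): a
        // segment whose files total 1,000,000 bytes with 250 of its 1,000
        // docs deleted has delRatio = 250 / 1000 = 0.25, so its calibrated
        // size is 1,000,000 * (1 - 0.25) = 750,000 bytes; SizeDocs would
        // likewise report 1,000 - 250 = 750 docs.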
        private bool IsOptimized(SegmentInfos infos, int maxNumSegments, System.Collections.Hashtable segmentsToOptimize)
        {
            int numSegments = infos.Count;
            int numToOptimize = 0;
            SegmentInfo optimizeInfo = null;
            for (int i = 0; i < numSegments && numToOptimize <= maxNumSegments; i++)
            {
                SegmentInfo info = infos.Info(i);
                if (segmentsToOptimize.Contains(info))
                {
                    numToOptimize++;
                    optimizeInfo = info;
                }
            }
            return numToOptimize <= maxNumSegments && (numToOptimize != 1 || IsOptimized(optimizeInfo));
        }
        /// <summary>Returns true if this single info is optimized (has no
        /// pending norms or deletes, is in the same dir as the
        /// writer, and matches the current compound file setting).
        /// </summary>
        private bool IsOptimized(SegmentInfo info)
        {
            bool hasDeletions = writer.NumDeletedDocs(info) > 0;
            return !hasDeletions && !info.HasSeparateNorms() && info.dir == writer.GetDirectory() &&
                (info.GetUseCompoundFile() == useCompoundFile || noCFSRatio < 1.0);
        }
        /// <summary>Returns the merges necessary to optimize the index.
        /// This merge policy defines "optimized" to mean only one
        /// segment in the index, where that segment has no
        /// deletions pending nor separate norms, and it is in
        /// compound file format if the current useCompoundFile
        /// setting is true. This method returns multiple merges
        /// (mergeFactor at a time) so the {@link MergeScheduler}
        /// in use may make use of concurrency.
        /// </summary>
        public override MergeSpecification FindMergesForOptimize(SegmentInfos infos, int maxNumSegments, System.Collections.Hashtable segmentsToOptimize)
        {
            MergeSpecification spec;
            System.Diagnostics.Debug.Assert(maxNumSegments > 0);
            if (!IsOptimized(infos, maxNumSegments, segmentsToOptimize))
            {
                // Find the newest (rightmost) segment that needs to
                // be optimized (other segments may have been flushed
                // since optimize started):
                int last = infos.Count;
                while (last > 0)
                {
                    SegmentInfo info = infos.Info(--last);
                    if (segmentsToOptimize.Contains(info))
                    {
                        last++;
                        break;
                    }
                }
                if (last > 0)
                {
                    spec = new MergeSpecification();
                    // First, enroll all "full" merges (size
                    // mergeFactor) to potentially be run concurrently:
                    while (last - maxNumSegments + 1 >= mergeFactor)
                    {
                        spec.Add(MakeOneMerge(infos, infos.Range(last - mergeFactor, last)));
                        last -= mergeFactor;
                    }
                    // Only if there are no full merges pending do we
                    // add a final partial (< mergeFactor segments) merge:
                    if (0 == spec.merges.Count)
                    {
                        if (maxNumSegments == 1)
                        {
                            // Since we must optimize down to 1 segment, the
                            // choice is simple:
                            if (last > 1 || !IsOptimized(infos.Info(0)))
                                spec.Add(MakeOneMerge(infos, infos.Range(0, last)));
                        }
                        else if (last > maxNumSegments)
                        {
                            // Take care to pick a partial merge that is
                            // least cost, but does not make the index too
                            // lopsided. If we always just picked the
                            // partial tail then we could produce a highly
                            // lopsided index over time:

                            // We must merge this many segments to leave
                            // maxNumSegments in the index (from when
                            // optimize was first kicked off):
                            int finalMergeSize = last - maxNumSegments + 1;

                            // Consider all possible starting points:
                            long bestSize = 0;
                            int bestStart = 0;
                            for (int i = 0; i < last - finalMergeSize + 1; i++)
                            {
                                long sumSize = 0;
                                for (int j = 0; j < finalMergeSize; j++)
                                    sumSize += Size(infos.Info(j + i));
                                if (i == 0 || (sumSize < 2 * Size(infos.Info(i - 1)) && sumSize < bestSize))
                                {
                                    bestStart = i;
                                    bestSize = sumSize;
                                }
                            }
                            spec.Add(MakeOneMerge(infos, infos.Range(bestStart, bestStart + finalMergeSize)));
                        }
                    }
                }
                else
                    spec = null;
            }
            else
                spec = null;
            return spec;
        }
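        // Worked example of the partial-merge pick above (illustrative
        // numbers): with last = 12, maxNumSegments = 4 and mergeFactor = 10,
        // no full merge qualifies (12 - 4 + 1 = 9 < 10), so finalMergeSize =
        // 9 and the loop slides a 9-segment window over starting points 0..3,
        // keeping the cheapest window whose total size is also less than
        // twice the size of the segment just before it.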
        /// <summary>Finds merges necessary to expunge all deletes from the
        /// index. We simply merge adjacent segments that have
        /// deletes, up to mergeFactor at a time.
        /// </summary>
        public override MergeSpecification FindMergesToExpungeDeletes(SegmentInfos segmentInfos)
        {
            int numSegments = segmentInfos.Count;

            Message("findMergesToExpungeDeletes: " + numSegments + " segments");

            MergeSpecification spec = new MergeSpecification();
            int firstSegmentWithDeletions = -1;
            for (int i = 0; i < numSegments; i++)
            {
                SegmentInfo info = segmentInfos.Info(i);
                int delCount = writer.NumDeletedDocs(info);
                if (delCount > 0)
                {
                    Message("  segment " + info.name + " has deletions");
                    if (firstSegmentWithDeletions == -1)
                        firstSegmentWithDeletions = i;
                    else if (i - firstSegmentWithDeletions == mergeFactor)
                    {
                        // We've seen mergeFactor segments in a row with
                        // deletions, so force a merge now:
                        Message("  add merge " + firstSegmentWithDeletions + " to " + (i - 1) + " inclusive");
                        spec.Add(MakeOneMerge(segmentInfos, segmentInfos.Range(firstSegmentWithDeletions, i)));
                        firstSegmentWithDeletions = i;
                    }
                }
                else if (firstSegmentWithDeletions != -1)
                {
                    // End of a sequence of segments with deletions, so,
                    // merge those past segments even if it's fewer than
                    // mergeFactor segments
                    Message("  add merge " + firstSegmentWithDeletions + " to " + (i - 1) + " inclusive");
                    spec.Add(MakeOneMerge(segmentInfos, segmentInfos.Range(firstSegmentWithDeletions, i)));
                    firstSegmentWithDeletions = -1;
                }
            }

            if (firstSegmentWithDeletions != -1)
            {
                Message("  add merge " + firstSegmentWithDeletions + " to " + (numSegments - 1) + " inclusive");
                spec.Add(MakeOneMerge(segmentInfos, segmentInfos.Range(firstSegmentWithDeletions, numSegments)));
            }

            return spec;
        }
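        // Example of the run-tracking above (illustrative numbers): with
        // mergeFactor = 10, 18 segments, and deletions in segments 3..17, the
        // loop emits one merge for segments 3..12 once the run reaches ten
        // segments, and the post-loop check emits 13..17 even though that run
        // is shorter than mergeFactor.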
        /// <summary>Checks if any merges are now necessary and returns a
        /// {@link MergePolicy.MergeSpecification} if so. A merge
        /// is necessary when there are more than {@link
        /// #SetMergeFactor} segments at a given level. When
        /// multiple levels have too many segments, this method
        /// will return multiple merges, allowing the {@link
        /// MergeScheduler} to use concurrency.
        /// </summary>
        public override MergeSpecification FindMerges(SegmentInfos infos)
        {
            int numSegments = infos.Count;
            Message("findMerges: " + numSegments + " segments");

            // Compute levels, which is just log (base mergeFactor)
            // of the size of each segment
            float[] levels = new float[numSegments];
            float norm = (float) System.Math.Log(mergeFactor);

            for (int i = 0; i < numSegments; i++)
            {
                SegmentInfo info = infos.Info(i);
                long size = Size(info);

                // Floor tiny segments
                if (size < 1)
                    size = 1;
                levels[i] = (float) System.Math.Log(size) / norm;
            }

            float levelFloor;
            if (minMergeSize <= 0)
                levelFloor = (float) 0.0;
            else
                levelFloor = (float) (System.Math.Log(minMergeSize) / norm);

            // Now, we quantize the log values into levels. The
            // first level is any segment whose log size is within
            // LEVEL_LOG_SPAN of the max size, or, who has such a
            // segment "to the right". Then, we find the max of all
            // other segments and use that to define the next level
            // segment, etc.

            MergeSpecification spec = null;

            int start = 0;
            while (start < numSegments)
            {
                // Find max level of all segments not already
                // quantized:
                float maxLevel = levels[start];
                for (int i = 1 + start; i < numSegments; i++)
                {
                    float level = levels[i];
                    if (level > maxLevel)
                        maxLevel = level;
                }

                // Now search backwards for the rightmost segment that
                // falls into this level:
                float levelBottom;
                if (maxLevel < levelFloor)
                    // All remaining segments fall into the min level
                    levelBottom = -1.0F;
                else
                {
                    levelBottom = (float) (maxLevel - LEVEL_LOG_SPAN);

                    // Force a boundary at the level floor
                    if (levelBottom < levelFloor && maxLevel >= levelFloor)
                        levelBottom = levelFloor;
                }

                int upto = numSegments - 1;
                while (upto >= start)
                {
                    if (levels[upto] >= levelBottom)
                        break;
                    upto--;
                }

                Message("  level " + levelBottom + " to " + maxLevel + ": " + (1 + upto - start) + " segments");

                // Finally, record all merges that are viable at this level:
                int end = start + mergeFactor;
                while (end <= 1 + upto)
                {
                    bool anyTooLarge = false;
                    for (int i = start; i < end; i++)
                    {
                        SegmentInfo info = infos.Info(i);
                        anyTooLarge |= (Size(info) >= maxMergeSize || SizeDocs(info) >= maxMergeDocs);
                    }
                    if (!anyTooLarge)
                    {
                        if (spec == null)
                            spec = new MergeSpecification();
                        Message("  " + start + " to " + end + ": add this merge");
                        spec.Add(MakeOneMerge(infos, infos.Range(start, end)));
                    }
                    else
                        Message("  " + start + " to " + end + ": contains segment over maxMergeSize or maxMergeDocs; skipping");
                    start = end;
                    end = start + mergeFactor;
                }
                start = 1 + upto;
            }
            return spec;
        }
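        // Worked example of the level computation (illustrative numbers):
        // with mergeFactor = 10, a segment of size 1,000,000 gets level
        // log(1e6)/log(10) = 6.0, and LEVEL_LOG_SPAN = 0.75 puts every
        // segment with level >= 5.25 (size >= ~178,000) into the same top
        // level; ten such segments trigger one merge.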
        protected OneMerge MakeOneMerge(SegmentInfos infos, SegmentInfos infosToMerge)
        {
            bool doCFS;
            if (!useCompoundFile)
                doCFS = false;
            else if (noCFSRatio == 1.0)
                doCFS = true;
            else
            {
                long totSize = 0;
                for (int i = 0; i < infos.Count; i++)
                    totSize += Size(infos.Info(i));
                long mergeSize = 0;
                for (int i = 0; i < infosToMerge.Count; i++)
                    mergeSize += Size(infosToMerge.Info(i));
                doCFS = mergeSize <= noCFSRatio * totSize;
            }
            return new OneMerge(infosToMerge, doCFS);
        }
        /// <summary><p/>Determines the largest segment (measured by
        /// document count) that may be merged with other segments.
        /// Small values (e.g., less than 10,000) are best for
        /// interactive indexing, as this limits the length of
        /// pauses while indexing to a few seconds. Larger values
        /// are best for batched indexing and speedier
        /// searches.<p/>
        ///
        /// <p/>The default value is {@link Integer#MAX_VALUE}.<p/>
        ///
        /// <p/>The default merge policy ({@link
        /// LogByteSizeMergePolicy}) also allows you to set this
        /// limit by net size (in MB) of the segment, using {@link
        /// LogByteSizeMergePolicy#SetMaxMergeMB}.<p/>
        /// </summary>
        public virtual void SetMaxMergeDocs(int maxMergeDocs)
        {
            this.maxMergeDocs = maxMergeDocs;
        }
        /// <summary>Returns the largest segment (measured by document
        /// count) that may be merged with other segments.
        /// </summary>
        /// <seealso cref="SetMaxMergeDocs">
        /// </seealso>
        public virtual int GetMaxMergeDocs()
        {
            return maxMergeDocs;
        }
    }
}