Merge pull request #409 from Alkarex/patch-1
[mono.git] / mcs / tools / monkeydoc / Lucene.Net / Lucene.Net / Index / LogMergePolicy.cs
1 /* 
2  * Licensed to the Apache Software Foundation (ASF) under one or more
3  * contributor license agreements.  See the NOTICE file distributed with
4  * this work for additional information regarding copyright ownership.
5  * The ASF licenses this file to You under the Apache License, Version 2.0
6  * (the "License"); you may not use this file except in compliance with
7  * the License.  You may obtain a copy of the License at
8  * 
9  * http://www.apache.org/licenses/LICENSE-2.0
10  * 
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17
18 using System;
19
20 namespace Mono.Lucene.Net.Index
21 {
22         
        /// <summary><p/>This class implements a <see cref="MergePolicy"/> that tries
        /// to merge segments into levels of exponentially
        /// increasing size, where each level has fewer segments than
        /// the value of the merge factor. Whenever extra segments
        /// (beyond the merge factor upper bound) are encountered,
        /// all segments within the level are merged. You can get or
        /// set the merge factor using <see cref="GetMergeFactor()"/> and
        /// <see cref="SetMergeFactor(int)"/> respectively.<p/>
        /// 
        /// <p/>This class is abstract and requires a subclass to
        /// define the <see cref="Size"/> method which specifies how a
        /// segment's size is determined.  <see cref="LogDocMergePolicy"/>
        /// is one subclass that measures size by document count in
        /// the segment.  <see cref="LogByteSizeMergePolicy"/> is another
        /// subclass that measures size as the total byte size of the
        /// file(s) for the segment.<p/>
        /// </summary>
40         
41         public abstract class LogMergePolicy:MergePolicy
42         {
        /// <summary>Defines the allowed range of log(size) for each
        /// level.  A level is computed by taking the max segment
        /// log size, minus LEVEL_LOG_SPAN, and finding all
        /// segments falling within that range. 
        /// </summary>
        public const double LEVEL_LOG_SPAN = 0.75;
        
        /// <summary>Default merge factor, which is how many segments are
        /// merged at a time. 
        /// </summary>
        public const int DEFAULT_MERGE_FACTOR = 10;
        
        /// <summary>Default maximum segment size: unbounded.</summary>
        /// <seealso cref="SetMaxMergeDocs">
        /// </seealso>
        public static readonly int DEFAULT_MAX_MERGE_DOCS = System.Int32.MaxValue;

        /// <summary>Default noCFSRatio.  If a merge's size is 10% or more of
        /// the index, then we disable compound file for it.
        /// </summary>
        /// <seealso cref="SetNoCFSRatio">
        /// </seealso>
        public static double DEFAULT_NO_CFS_RATIO = 0.1;
        
        // Number of segments merged at a time; see SetMergeFactor(int).
        private int mergeFactor = DEFAULT_MERGE_FACTOR;
        
        // Merge thresholds expressed in the unit chosen by the subclass's
        // Size(SegmentInfo) (documents or bytes).  Segments smaller than
        // minMergeSize share the lowest level; segments at or above
        // maxMergeSize / maxMergeDocs are never selected for merging.
        internal long minMergeSize;
        internal long maxMergeSize;
        internal int maxMergeDocs = DEFAULT_MAX_MERGE_DOCS;

        // Merges covering at least this fraction of the total index size
        // keep the non-compound file format; see SetNoCFSRatio(double).
        protected double noCFSRatio = DEFAULT_NO_CFS_RATIO;
        
        /* TODO 3.0: change this default to true */
        protected internal bool calibrateSizeByDeletes = false;
        
        // Whether newly merged segments (and doc stores) use compound files.
        private bool useCompoundFile = true;
        private bool useCompoundDocStore = true;
80                 
        /// <summary>Creates a policy bound to the given writer, which the
        /// policy consults for deletion counts and verbose logging. </summary>
        public LogMergePolicy(IndexWriter writer):base(writer)
        {
        }
84                 
85                 protected internal virtual bool Verbose()
86                 {
87                         return writer != null && writer.Verbose();
88                 }
89
90
        /// <summary>Returns the current noCFSRatio setting. </summary>
        /// <seealso cref="SetNoCFSRatio">
        /// </seealso>
        public double GetNoCFSRatio()
        {
            return noCFSRatio;
        }
96
97         /** If a merged segment will be more than this percentage
98          *  of the total size of the index, leave the segment as
99          *  non-compound file even if compound file is enabled.
100          *  Set to 1.0 to always use CFS regardless of merge
101          *  size. */
102         public void SetNoCFSRatio(double noCFSRatio)
103         {
104             if (noCFSRatio < 0.0 || noCFSRatio > 1.0)
105             {
106                 throw new ArgumentException("noCFSRatio must be 0.0 to 1.0 inclusive; got " + noCFSRatio);
107             }
108             this.noCFSRatio = noCFSRatio;
109         }
110                 
        /// <summary>Writes a diagnostic message to the writer's info stream,
        /// prefixed with "LMP: ", but only when verbose logging is on. </summary>
        private void  Message(System.String message)
        {
            if (Verbose())
                writer.Message("LMP: " + message);
        }
116                 
        /// <summary><p/>Returns the number of segments that are merged at
        /// once and also controls the total number of segments
        /// allowed to accumulate in the index.<p/> 
        /// </summary>
        public virtual int GetMergeFactor()
        {
            return mergeFactor;
        }
125                 
126                 /// <summary>Determines how often segment indices are merged by
127                 /// addDocument().  With smaller values, less RAM is used
128                 /// while indexing, and searches on unoptimized indices are
129                 /// faster, but indexing speed is slower.  With larger
130                 /// values, more RAM is used during indexing, and while
131                 /// searches on unoptimized indices are slower, indexing is
132         /// faster.  Thus larger values (&gt; 10) are best for batch
133         /// index creation, and smaller values (&lt; 10) for indices
134                 /// that are interactively maintained. 
135                 /// </summary>
136                 public virtual void  SetMergeFactor(int mergeFactor)
137                 {
138                         if (mergeFactor < 2)
139                                 throw new System.ArgumentException("mergeFactor cannot be less than 2");
140                         this.mergeFactor = mergeFactor;
141                 }
142                 
        /// <summary>Returns the current compound-file setting for newly merged
        /// segments; this policy ignores the per-segment arguments. </summary>
        public override bool UseCompoundFile(SegmentInfos infos, SegmentInfo info)
        {
            return useCompoundFile;
        }
148                 
        /// <summary>Sets whether compound file format should be used for
        /// newly flushed and newly merged segments. 
        /// </summary>
        public virtual void  SetUseCompoundFile(bool useCompoundFile)
        {
            this.useCompoundFile = useCompoundFile;
        }
156                 
        /// <summary>Returns true if newly flushed and newly merged segments
        /// are set to use the compound file format. </summary>
        /// <seealso cref="SetUseCompoundFile">
        /// </seealso>
        public virtual bool GetUseCompoundFile()
        {
            return useCompoundFile;
        }
164                 
        /// <summary>Returns the current compound-file setting for newly
        /// flushed doc store segment files; the argument is ignored. </summary>
        public override bool UseCompoundDocStore(SegmentInfos infos)
        {
            return useCompoundDocStore;
        }
170                 
        /// <summary>Sets whether compound file format should be used for
        /// newly flushed and newly merged doc store
        /// segment files (term vectors and stored fields). 
        /// </summary>
        public virtual void  SetUseCompoundDocStore(bool useCompoundDocStore)
        {
            this.useCompoundDocStore = useCompoundDocStore;
        }
179                 
        /// <summary>Returns true if newly flushed and newly merged doc
        /// store segment files (term vectors and stored fields)
        /// are set to use the compound file format. 
        /// </summary>
        /// <seealso cref="SetUseCompoundDocStore">
        /// </seealso>
        public virtual bool GetUseCompoundDocStore()
        {
            return useCompoundDocStore;
        }
189                 
        /// <summary>Sets whether the segment size should be calibrated by
        /// the number of deletes when choosing segments for merge. 
        /// </summary>
        public virtual void  SetCalibrateSizeByDeletes(bool calibrateSizeByDeletes)
        {
            this.calibrateSizeByDeletes = calibrateSizeByDeletes;
        }
197                 
        /// <summary>Returns true if the segment size should be calibrated 
        /// by the number of deletes when choosing segments for merge. 
        /// </summary>
        public virtual bool GetCalibrateSizeByDeletes()
        {
            return calibrateSizeByDeletes;
        }
205                 
        /// <summary>Releases resources held by the policy; this
        /// implementation holds none, so the method is a no-op. </summary>
        public override void  Close()
        {
        }
209                 
        /// <summary>Returns the size of the given segment, in the unit
        /// this policy levels by (documents or bytes); implemented
        /// by subclasses such as LogDocMergePolicy and
        /// LogByteSizeMergePolicy. </summary>
        abstract protected internal long Size(SegmentInfo info);
212                 protected internal virtual long SizeDocs(SegmentInfo info)
213                 {
214                         if (calibrateSizeByDeletes)
215                         {
216                                 int delCount = writer.NumDeletedDocs(info);
217                                 return (info.docCount - (long) delCount);
218                         }
219                         else
220                         {
221                                 return info.docCount;
222                         }
223                 }
224                 
225                 protected internal virtual long SizeBytes(SegmentInfo info)
226                 {
227                         long byteSize = info.SizeInBytes();
228                         if (calibrateSizeByDeletes)
229                         {
230                                 int delCount = writer.NumDeletedDocs(info);
231                                 float delRatio = (info.docCount <= 0?0.0f:((float) delCount / (float) info.docCount));
232                                 return (info.docCount <= 0?byteSize:(long) (byteSize * (1.0f - delRatio)));
233                         }
234                         else
235                         {
236                                 return byteSize;
237                         }
238                 }
239                 
        /// <summary>Returns true if the index is already optimized with respect
        /// to maxNumSegments: no more than maxNumSegments of the candidate
        /// segments remain and, when exactly one remains, that segment
        /// itself is optimized (see IsOptimized(SegmentInfo)). </summary>
        private bool IsOptimized(SegmentInfos infos, int maxNumSegments, System.Collections.Hashtable segmentsToOptimize)
        {
            int numSegments = infos.Count;
            int numToOptimize = 0;
            SegmentInfo optimizeInfo = null;
            // Stop counting as soon as we have seen more candidates than
            // maxNumSegments -- the exact count beyond that doesn't matter.
            for (int i = 0; i < numSegments && numToOptimize <= maxNumSegments; i++)
            {
                SegmentInfo info = infos.Info(i);
                if (segmentsToOptimize.Contains(info))
                {
                    numToOptimize++;
                    optimizeInfo = info;
                }
            }
            
            return numToOptimize <= maxNumSegments && (numToOptimize != 1 || IsOptimized(optimizeInfo));
        }
257                 
        /// <summary>Returns true if this single info is optimized (has no
        /// pending norms or deletes, is in the same dir as the
        /// writer, and matches the current compound file setting). 
        /// </summary>
        private bool IsOptimized(SegmentInfo info)
        {
            bool hasDeletions = writer.NumDeletedDocs(info) > 0;
            // A CFS mismatch is tolerated when noCFSRatio < 1.0, since large
            // merges may legitimately stay non-compound in that mode.
            return !hasDeletions && !info.HasSeparateNorms() && info.dir == writer.GetDirectory() &&
                (info.GetUseCompoundFile() == useCompoundFile || noCFSRatio < 1.0);
        }
268                 
        /// <summary>Returns the merges necessary to optimize the index.
        /// This merge policy defines "optimized" to mean only one
        /// segment in the index, where that segment has no
        /// deletions pending nor separate norms, and it is in
        /// compound file format if the current useCompoundFile
        /// setting is true.  This method returns multiple merges
        /// (mergeFactor at a time) so the <see cref="MergeScheduler"/>
        /// in use may make use of concurrency. 
        /// </summary>
        public override MergeSpecification FindMergesForOptimize(SegmentInfos infos, int maxNumSegments, System.Collections.Hashtable segmentsToOptimize)
        {
            MergeSpecification spec;
            
            System.Diagnostics.Debug.Assert(maxNumSegments > 0);
            
            if (!IsOptimized(infos, maxNumSegments, segmentsToOptimize))
            {
                
                // Find the newest (rightmost) segment that needs to
                // be optimized (other segments may have been flushed
                // since optimize started):
                int last = infos.Count;
                while (last > 0)
                {
                    SegmentInfo info = infos.Info(--last);
                    if (segmentsToOptimize.Contains(info))
                    {
                        last++;
                        break;
                    }
                }
                
                if (last > 0)
                {
                    
                    spec = new MergeSpecification();
                    
                    // First, enroll all "full" merges (size
                    // mergeFactor) to potentially be run concurrently:
                    while (last - maxNumSegments + 1 >= mergeFactor)
                    {
                        spec.Add(MakeOneMerge(infos, infos.Range(last - mergeFactor, last)));
                        last -= mergeFactor;
                    }
                    
                    // Only if there are no full merges pending do we
                    // add a final partial (< mergeFactor segments) merge:
                    if (0 == spec.merges.Count)
                    {
                        if (maxNumSegments == 1)
                        {
                            
                            // Since we must optimize down to 1 segment, the
                            // choice is simple:
                            if (last > 1 || !IsOptimized(infos.Info(0)))
                                spec.Add(MakeOneMerge(infos, infos.Range(0, last)));
                        }
                        else if (last > maxNumSegments)
                        {
                            
                            // Take care to pick a partial merge that is
                            // least cost, but does not make the index too
                            // lopsided.  If we always just picked the
                            // partial tail then we could produce a highly
                            // lopsided index over time:
                            
                            // We must merge this many segments to leave
                            // maxNumSegments in the index (from when
                            // optimize was first kicked off):
                            int finalMergeSize = last - maxNumSegments + 1;
                            
                            // Consider all possible starting points:
                            long bestSize = 0;
                            int bestStart = 0;
                            
                            for (int i = 0; i < last - finalMergeSize + 1; i++)
                            {
                                long sumSize = 0;
                                for (int j = 0; j < finalMergeSize; j++)
                                    sumSize += Size(infos.Info(j + i));
                                // Prefer the cheapest window, but never one more
                                // than twice the size of the segment just before
                                // it (avoids lopsided merges):
                                if (i == 0 || (sumSize < 2 * Size(infos.Info(i - 1)) && sumSize < bestSize))
                                {
                                    bestStart = i;
                                    bestSize = sumSize;
                                }
                            }

                            spec.Add(MakeOneMerge(infos, infos.Range(bestStart, bestStart + finalMergeSize)));
                        }
                    }
                }
                else
                    // No candidate segments remain; nothing to merge.
                    spec = null;
            }
            else
                // Index already satisfies the optimize criteria.
                spec = null;
            
            return spec;
        }
368                 
        /// <summary> Finds merges necessary to expunge all deletes from the
        /// index.  We simply merge adjacent segments that have
        /// deletes, up to mergeFactor at a time.
        /// </summary>
        public override MergeSpecification FindMergesToExpungeDeletes(SegmentInfos segmentInfos)
        {
            int numSegments = segmentInfos.Count;
            
            if (Verbose())
                Message("findMergesToExpungeDeletes: " + numSegments + " segments");
            
            MergeSpecification spec = new MergeSpecification();
            // Index of the first segment in the current run of adjacent
            // segments with deletions, or -1 when not inside such a run.
            int firstSegmentWithDeletions = - 1;
            for (int i = 0; i < numSegments; i++)
            {
                SegmentInfo info = segmentInfos.Info(i);
                int delCount = writer.NumDeletedDocs(info);
                if (delCount > 0)
                {
                    if (Verbose())
                        Message("  segment " + info.name + " has deletions");
                    if (firstSegmentWithDeletions == - 1)
                        firstSegmentWithDeletions = i;
                    else if (i - firstSegmentWithDeletions == mergeFactor)
                    {
                        // We've seen mergeFactor segments in a row with
                        // deletions, so force a merge now:
                        if (Verbose())
                            Message("  add merge " + firstSegmentWithDeletions + " to " + (i - 1) + " inclusive");
                        spec.Add(MakeOneMerge(segmentInfos, segmentInfos.Range(firstSegmentWithDeletions, i)));
                        firstSegmentWithDeletions = i;
                    }
                }
                else if (firstSegmentWithDeletions != - 1)
                {
                    // End of a sequence of segments with deletions, so,
                    // merge those past segments even if it's fewer than
                    // mergeFactor segments
                    if (Verbose())
                        Message("  add merge " + firstSegmentWithDeletions + " to " + (i - 1) + " inclusive");
                    spec.Add(MakeOneMerge(segmentInfos, segmentInfos.Range(firstSegmentWithDeletions, i)));
                    firstSegmentWithDeletions = - 1;
                }
            }
            
            // Flush a trailing run that extends to the end of the index.
            if (firstSegmentWithDeletions != - 1)
            {
                if (Verbose())
                    Message("  add merge " + firstSegmentWithDeletions + " to " + (numSegments - 1) + " inclusive");
                spec.Add(MakeOneMerge(segmentInfos, segmentInfos.Range(firstSegmentWithDeletions, numSegments)));
            }
            
            return spec;
        }
423                 
        /// <summary>Checks if any merges are now necessary and returns a
        /// <see cref="MergePolicy.MergeSpecification"/> if so.  A merge
        /// is necessary when there are more than
        /// <see cref="SetMergeFactor"/> segments at a given level.  When
        /// multiple levels have too many segments, this method
        /// will return multiple merges, allowing the
        /// <see cref="MergeScheduler"/> to use concurrency. 
        /// </summary>
        public override MergeSpecification FindMerges(SegmentInfos infos)
        {
            
            int numSegments = infos.Count;
            if (Verbose())
                Message("findMerges: " + numSegments + " segments");
            
            // Compute levels, which is just log (base mergeFactor)
            // of the size of each segment
            float[] levels = new float[numSegments];
            float norm = (float) System.Math.Log(mergeFactor);
            
            for (int i = 0; i < numSegments; i++)
            {
                SegmentInfo info = infos.Info(i);
                long size = Size(info);
                
                // Floor tiny segments
                if (size < 1)
                    size = 1;
                levels[i] = (float) System.Math.Log(size) / norm;
            }
            
            // Segments whose level falls below levelFloor are all treated
            // as belonging to the lowest level (derived from minMergeSize).
            float levelFloor;
            if (minMergeSize <= 0)
                levelFloor = (float) 0.0;
            else
            {
                levelFloor = (float) (System.Math.Log(minMergeSize) / norm);
            }
            
            // Now, we quantize the log values into levels.  The
            // first level is any segment whose log size is within
            // LEVEL_LOG_SPAN of the max size, or, who has such as
            // segment "to the right".  Then, we find the max of all
            // other segments and use that to define the next level
            // segment, etc.
            
            MergeSpecification spec = null;
            
            int start = 0;
            while (start < numSegments)
            {
                
                // Find max level of all segments not already
                // quantized.
                float maxLevel = levels[start];
                for (int i = 1 + start; i < numSegments; i++)
                {
                    float level = levels[i];
                    if (level > maxLevel)
                        maxLevel = level;
                }
                
                // Now search backwards for the rightmost segment that
                // falls into this level:
                float levelBottom;
                if (maxLevel < levelFloor)
                // All remaining segments fall into the min level
                    levelBottom = - 1.0F;
                else
                {
                    levelBottom = (float) (maxLevel - LEVEL_LOG_SPAN);
                    
                    // Force a boundary at the level floor
                    if (levelBottom < levelFloor && maxLevel >= levelFloor)
                        levelBottom = levelFloor;
                }
                
                int upto = numSegments - 1;
                while (upto >= start)
                {
                    if (levels[upto] >= levelBottom)
                    {
                        break;
                    }
                    upto--;
                }
                if (Verbose())
                    Message("  level " + levelBottom + " to " + maxLevel + ": " + (1 + upto - start) + " segments");
                
                // Finally, record all merges that are viable at this level:
                int end = start + mergeFactor;
                while (end <= 1 + upto)
                {
                    // Skip windows containing any segment already at or over
                    // the maxMergeSize / maxMergeDocs thresholds.
                    bool anyTooLarge = false;
                    for (int i = start; i < end; i++)
                    {
                        SegmentInfo info = infos.Info(i);
                        anyTooLarge |= (Size(info) >= maxMergeSize || SizeDocs(info) >= maxMergeDocs);
                    }
                    
                    if (!anyTooLarge)
                    {
                        if (spec == null)
                            spec = new MergeSpecification();
                        if (Verbose())
                            Message("    " + start + " to " + end + ": add this merge");
                        spec.Add(MakeOneMerge(infos, infos.Range(start, end)));
                    }
                    else if (Verbose())
                        Message("    " + start + " to " + end + ": contains segment over maxMergeSize or maxMergeDocs; skipping");
                    
                    start = end;
                    end = start + mergeFactor;
                }
                
                start = 1 + upto;
            }
            
            // Null means no merges are currently necessary.
            return spec;
        }
544         
545         protected OneMerge MakeOneMerge(SegmentInfos infos, SegmentInfos infosToMerge)
546         {
547             bool doCFS;
548             if (!useCompoundFile)
549             {
550                 doCFS = false;
551             }
552             else if (noCFSRatio == 1.0)
553             {
554                 doCFS = true;
555             }
556             else
557             {
558                 long totSize = 0;
559                 for (int i = 0; i < infos.Count; i++)
560                 {
561                     totSize += Size(infos.Info(i));
562                 }
563                 long mergeSize = 0;
564                 for (int i = 0; i < infosToMerge.Count; i++)
565                 {
566                     mergeSize += Size(infosToMerge.Info(i));
567                 }
568
569                 doCFS = mergeSize <= noCFSRatio * totSize;
570             }
571
572             return new OneMerge(infosToMerge, doCFS);
573         }
574                 
        /// <summary><p/>Determines the largest segment (measured by
        /// document count) that may be merged with other segments.
        /// Small values (e.g., less than 10,000) are best for
        /// interactive indexing, as this limits the length of
        /// pauses while indexing to a few seconds.  Larger values
        /// are best for batched indexing and speedier
        /// searches.<p/>
        /// 
        /// <p/>The default value is <see cref="int.MaxValue"/>.<p/>
        /// 
        /// <p/>The default merge policy (<see cref="LogByteSizeMergePolicy"/>)
        /// also allows you to set this
        /// limit by net size (in MB) of the segment, using
        /// <see cref="LogByteSizeMergePolicy.SetMaxMergeMB"/>.<p/>
        /// </summary>
        public virtual void  SetMaxMergeDocs(int maxMergeDocs)
        {
            this.maxMergeDocs = maxMergeDocs;
        }
594                 
        /// <summary>Returns the largest segment (measured by document
        /// count) that may be merged with other segments.
        /// </summary>
        /// <seealso cref="SetMaxMergeDocs">
        /// </seealso>
        public virtual int GetMaxMergeDocs()
        {
            return maxMergeDocs;
        }
604         }
605 }