package org.apache.lucene.index;
/**
* Copyright 2004 The Apache Software Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.io.File;
import java.io.PrintStream;
import java.util.Vector;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Lock;
import org.apache.lucene.store.InputStream;
import org.apache.lucene.store.OutputStream;
import org.apache.lucene.search.Similarity;
import org.apache.lucene.document.Document;
import org.apache.lucene.analysis.Analyzer;
/**
An IndexWriter creates and maintains an index.
The third argument to the <a href="#IndexWriter"><b>constructor</b></a>
determines whether a new index is created, or whether an existing index is
opened for the addition of new documents.
In either case, documents are added with the <a
href="#addDocument"><b>addDocument</b></a> method. When finished adding
documents, <a href="#close"><b>close</b></a> should be called.
If an index will not have more documents added for a while and optimal search
performance is desired, then the <a href="#optimize"><b>optimize</b></a>
method should be called before the index is closed.
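As a minimal sketch (the analyzer choice and field contents here are
illustrative, not requirements of this class), an indexing session might
look like:
<pre>
  IndexWriter writer =
      new IndexWriter("/tmp/testindex", new StandardAnalyzer(), true);
  Document doc = new Document();
  doc.add(Field.Text("contents", "an example document"));
  writer.addDocument(doc);
  writer.optimize(); // optional: merge all segments for faster searches
  writer.close();
</pre>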
*/
public class IndexWriter {
/**
* Default value is 1000 milliseconds. Use <code>org.apache.lucene.writeLockTimeout</code>
* system property to override.
*/
public static long WRITE_LOCK_TIMEOUT =
Long.parseLong(System.getProperty("org.apache.lucene.writeLockTimeout",
"1000"));
/**
* Default value is 10000 milliseconds. Use <code>org.apache.lucene.commitLockTimeout</code>
* system property to override.
*/
public static long COMMIT_LOCK_TIMEOUT =
Long.parseLong(System.getProperty("org.apache.lucene.commitLockTimeout",
"10000"));
public static final String WRITE_LOCK_NAME = "write.lock";
public static final String COMMIT_LOCK_NAME = "commit.lock";
/**
* Default value is 10. Use <code>org.apache.lucene.mergeFactor</code>
* system property to override.
*/
public static final int DEFAULT_MERGE_FACTOR =
Integer.parseInt(System.getProperty("org.apache.lucene.mergeFactor",
"10"));
/**
* Default value is 10. Use <code>org.apache.lucene.minMergeDocs</code>
* system property to override.
*/
public static final int DEFAULT_MIN_MERGE_DOCS =
Integer.parseInt(System.getProperty("org.apache.lucene.minMergeDocs",
"10"));
/**
* Default value is {@link Integer#MAX_VALUE}.
* Use <code>org.apache.lucene.maxMergeDocs</code> system property to override.
*/
public static final int DEFAULT_MAX_MERGE_DOCS =
Integer.parseInt(System.getProperty("org.apache.lucene.maxMergeDocs",
String.valueOf(Integer.MAX_VALUE)));
/**
* Default value is 10000. Use <code>org.apache.lucene.maxFieldLength</code>
* system property to override.
*/
public static final int DEFAULT_MAX_FIELD_LENGTH =
Integer.parseInt(System.getProperty("org.apache.lucene.maxFieldLength",
"10000"));
private Directory directory; // where this index resides
private Analyzer analyzer; // how to analyze text
private Similarity similarity = Similarity.getDefault(); // how to normalize
private SegmentInfos segmentInfos = new SegmentInfos(); // the segments
private final Directory ramDirectory = new RAMDirectory(); // for temp segs
private Lock writeLock;
/** Use compound file setting. Defaults to true, minimizing the number of
* files used. Setting this to false may improve indexing performance, but
* may also cause file handle problems.
*/
private boolean useCompoundFile = true;
private boolean closeDir;
/** Returns true iff the index will be written in compound file format. When
* on, the multiple files for each segment are merged into a single file once
* the segment creation is finished. This is done regardless of what directory
* is in use.
*/
public boolean getUseCompoundFile() {
return useCompoundFile;
}
/** Sets whether the index will be written in compound file format. When on,
* the multiple files for each segment are merged into a single file once the
* segment creation is finished. This is done regardless of what directory is
* in use.
*/
public void setUseCompoundFile(boolean value) {
useCompoundFile = value;
}
/** Expert: Set the Similarity implementation used by this IndexWriter.
*
* @see Similarity#setDefault(Similarity)
*/
public void setSimilarity(Similarity similarity) {
this.similarity = similarity;
}
/** Expert: Return the Similarity implementation used by this IndexWriter.
*
* <p>This defaults to the current value of {@link Similarity#getDefault()}.
*/
public Similarity getSimilarity() {
return this.similarity;
}
/**
* Constructs an IndexWriter for the index in <code>path</code>.
* Text will be analyzed with <code>a</code>. If <code>create</code>
* is true, then a new, empty index will be created in
* <code>path</code>, replacing the index already there, if any.
*
* @param path the path to the index directory
* @param a the analyzer to use
* @param create <code>true</code> to create the index or overwrite
* the existing one; <code>false</code> to append to the existing
* index
* @throws IOException if the directory cannot be read/written to, or
* if it does not exist, and <code>create</code> is
* <code>false</code>
*/
public IndexWriter(String path, Analyzer a, boolean create)
throws IOException {
this(FSDirectory.getDirectory(path, create), a, create, true);
}
/**
* Constructs an IndexWriter for the index in <code>path</code>.
* Text will be analyzed with <code>a</code>. If <code>create</code>
* is true, then a new, empty index will be created in
* <code>path</code>, replacing the index already there, if any.
*
* @param path the path to the index directory
* @param a the analyzer to use
* @param create <code>true</code> to create the index or overwrite
* the existing one; <code>false</code> to append to the existing
* index
* @throws IOException if the directory cannot be read/written to, or
* if it does not exist, and <code>create</code> is
* <code>false</code>
*/
public IndexWriter(File path, Analyzer a, boolean create)
throws IOException {
this(FSDirectory.getDirectory(path, create), a, create, true);
}
/**
* Constructs an IndexWriter for the index in <code>d</code>.
* Text will be analyzed with <code>a</code>. If <code>create</code>
* is true, then a new, empty index will be created in
* <code>d</code>, replacing the index already there, if any.
*
* @param d the index directory
* @param a the analyzer to use
* @param create <code>true</code> to create the index or overwrite
* the existing one; <code>false</code> to append to the existing
* index
* @throws IOException if the directory cannot be read/written to, or
* if it does not exist, and <code>create</code> is
* <code>false</code>
*/
public IndexWriter(Directory d, Analyzer a, boolean create)
throws IOException {
this(d, a, create, false);
}
private IndexWriter(Directory d, Analyzer a, final boolean create, boolean closeDir)
throws IOException {
this.closeDir = closeDir;
directory = d;
analyzer = a;
Lock writeLock = directory.makeLock(IndexWriter.WRITE_LOCK_NAME);
if (!writeLock.obtain(WRITE_LOCK_TIMEOUT)) // obtain write lock
throw new IOException("Index locked for write: " + writeLock);
this.writeLock = writeLock; // save it
synchronized (directory) { // in- & inter-process sync
new Lock.With(directory.makeLock(IndexWriter.COMMIT_LOCK_NAME), COMMIT_LOCK_TIMEOUT) {
public Object doBody() throws IOException {
if (create)
segmentInfos.write(directory);
else
segmentInfos.read(directory);
return null;
}
}.run();
}
}
/** Flushes all changes to an index and closes all associated files. */
public synchronized void close() throws IOException {
flushRamSegments();
ramDirectory.close();
writeLock.release(); // release write lock
writeLock = null;
if(closeDir)
directory.close();
}
/** Release the write lock, if needed. */
protected void finalize() throws IOException {
if (writeLock != null) {
writeLock.release(); // release write lock
writeLock = null;
}
}
/** Returns the analyzer used by this index. */
public Analyzer getAnalyzer() {
return analyzer;
}
/** Returns the number of documents currently in this index. */
public synchronized int docCount() {
int count = 0;
for (int i = 0; i < segmentInfos.size(); i++) {
SegmentInfo si = segmentInfos.info(i);
count += si.docCount;
}
return count;
}
/**
* The maximum number of terms that will be indexed for a single field in a
* document. This limits the amount of memory required for indexing, so that
* collections with very large files will not crash the indexing process by
* running out of memory.<p/>
* Note that this effectively truncates large documents, excluding from the
* index terms that occur further in the document. If you know your source
* documents are large, be sure to set this value high enough to accommodate
* the expected size. If you set it to Integer.MAX_VALUE, then the only limit
* is your memory, but you should anticipate an OutOfMemoryError.<p/>
* By default, no more than 10,000 terms will be indexed for a field.
*/
public int maxFieldLength = DEFAULT_MAX_FIELD_LENGTH;
/**
* Adds a document to this index. If the document contains more than
* {@link #maxFieldLength} terms for a given field, the remainder are
* discarded.
*/
public void addDocument(Document doc) throws IOException {
addDocument(doc, analyzer);
}
/**
* Adds a document to this index, using the provided analyzer instead of the
* value of {@link #getAnalyzer()}. If the document contains more than
* {@link #maxFieldLength} terms for a given field, the remainder are
* discarded.
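*
* <p>For example (a sketch; <code>WhitespaceAnalyzer</code> is just one
* possible per-document choice):
* <pre>
*   writer.addDocument(doc, new WhitespaceAnalyzer());
* </pre>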
*/
public void addDocument(Document doc, Analyzer analyzer) throws IOException {
DocumentWriter dw =
new DocumentWriter(ramDirectory, analyzer, similarity, maxFieldLength);
String segmentName = newSegmentName();
dw.addDocument(segmentName, doc);
synchronized (this) {
segmentInfos.addElement(new SegmentInfo(segmentName, 1, ramDirectory));
maybeMergeSegments();
}
}
/** Returns the current value of the segment name counter. */
final int getSegmentsCounter() {
return segmentInfos.counter;
}
private final synchronized String newSegmentName() {
return "_" + Integer.toString(segmentInfos.counter++, Character.MAX_RADIX);
}
/** Determines how often segment indices are merged by addDocument(). With
* smaller values, less RAM is used while indexing, and searches on
* unoptimized indices are faster, but indexing speed is slower. With larger
* values, more RAM is used during indexing, and while searches on unoptimized
* indices are slower, indexing is faster. Thus larger values (> 10) are best
* for batch index creation, and smaller values (< 10) for indices that are
* interactively maintained.
*
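* <p>For instance, a one-shot batch build might raise this, e.g.
* <code>writer.mergeFactor = 100;</code> (an illustrative value), while an
* interactively maintained index would keep it small.
*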
* <p>This must never be less than 2. The default value is 10.
*/
public int mergeFactor = DEFAULT_MERGE_FACTOR;
/** Determines the minimal number of documents required before the buffered
* in-memory documents are merged and a new segment is created.
* Since documents are buffered in a {@link org.apache.lucene.store.RAMDirectory},
* a larger value gives faster indexing. At the same time, mergeFactor limits
* the number of files open in an FSDirectory.
*
* <p> The default value is 10.*/
public int minMergeDocs = DEFAULT_MIN_MERGE_DOCS;
/** Determines the largest number of documents ever merged by addDocument().
* Small values (e.g., less than 10,000) are best for interactive indexing,
* as this limits the length of pauses while indexing to a few seconds.
* Larger values are best for batched indexing and speedier searches.
*
* <p>The default value is {@link Integer#MAX_VALUE}. */
public int maxMergeDocs = DEFAULT_MAX_MERGE_DOCS;
/** If non-null, information about merges will be printed to this. */
public PrintStream infoStream = null;
/** Merges all segments together into a single segment, optimizing an index
for search. */
public synchronized void optimize() throws IOException {
flushRamSegments();
while (segmentInfos.size() > 1 ||
(segmentInfos.size() == 1 &&
(SegmentReader.hasDeletions(segmentInfos.info(0)) ||
segmentInfos.info(0).dir != directory ||
(useCompoundFile &&
(!SegmentReader.usesCompoundFile(segmentInfos.info(0)) ||
SegmentReader.hasSeparateNorms(segmentInfos.info(0))))))) {
int minSegment = segmentInfos.size() - mergeFactor;
mergeSegments(minSegment < 0 ? 0 : minSegment);
}
}
/** Merges all segments from an array of indexes into this index.
*
* <p>This may be used to parallelize batch indexing. A large document
* collection can be broken into sub-collections. Each sub-collection can be
* indexed in parallel, on a different thread, process or machine. The
* complete index can then be created by merging sub-collection indexes
* with this method.
*
* <p>After this completes, the index is optimized. */
public synchronized void addIndexes(Directory[] dirs)
throws IOException {
optimize(); // start with zero or 1 seg
for (int i = 0; i < dirs.length; i++) {
SegmentInfos sis = new SegmentInfos(); // read infos from dir
sis.read(dirs[i]);
for (int j = 0; j < sis.size(); j++) {
segmentInfos.addElement(sis.info(j)); // add each info
}
}
optimize(); // final cleanup
}
/** Merges the provided indexes into this index.
* <p>After this completes, the index is optimized. </p>
* <p>The provided IndexReaders are not closed.</p>
*/
public synchronized void addIndexes(IndexReader[] readers)
throws IOException {
optimize(); // start with zero or 1 seg
String mergedName = newSegmentName();
SegmentMerger merger = new SegmentMerger(directory, mergedName, false);
if (segmentInfos.size() == 1) // add existing index, if any
merger.add(new SegmentReader(segmentInfos.info(0)));
for (int i = 0; i < readers.length; i++) // add new indexes
merger.add(readers[i]);
int docCount = merger.merge(); // merge 'em
segmentInfos.setSize(0); // pop old infos & add new
segmentInfos.addElement(new SegmentInfo(mergedName, docCount, directory));
synchronized (directory) { // in- & inter-process sync
new Lock.With(directory.makeLock("commit.lock"), COMMIT_LOCK_TIMEOUT) {
public Object doBody() throws IOException {
segmentInfos.write(directory); // commit changes
return null;
}
}.run();
}
}
/** Merges all RAM-resident segments. */
private final void flushRamSegments() throws IOException {
int minSegment = segmentInfos.size()-1;
int docCount = 0;
while (minSegment >= 0 &&
(segmentInfos.info(minSegment)).dir == ramDirectory) {
docCount += segmentInfos.info(minSegment).docCount;
minSegment--;
}
if (minSegment < 0 || // add one FS segment?
(docCount + segmentInfos.info(minSegment).docCount) > mergeFactor ||
!(segmentInfos.info(segmentInfos.size()-1).dir == ramDirectory))
minSegment++;
if (minSegment >= segmentInfos.size())
return; // none to merge
mergeSegments(minSegment);
}
/** Incremental segment merger. */
private final void maybeMergeSegments() throws IOException {
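// With the defaults (minMergeDocs=10, mergeFactor=10) this checks merge
// "levels" of 10, 100, 1000, ... docs: once the segments below a level
// together reach its target size they are merged into a single segment,
// and the next level is considered in turn.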
long targetMergeDocs = minMergeDocs;
while (targetMergeDocs <= maxMergeDocs) {
// find segments smaller than current target size
int minSegment = segmentInfos.size();
int mergeDocs = 0;
while (--minSegment >= 0) {
SegmentInfo si = segmentInfos.info(minSegment);
if (si.docCount >= targetMergeDocs)
break;
mergeDocs += si.docCount;
}
if (mergeDocs >= targetMergeDocs) // found a merge to do
mergeSegments(minSegment+1);
else
break;
targetMergeDocs *= mergeFactor; // increase target size
}
}
/** Pops segments off of segmentInfos stack down to minSegment, merges them,
and pushes the merged index onto the top of the segmentInfos stack. */
private final void mergeSegments(int minSegment)
throws IOException {
String mergedName = newSegmentName();
if (infoStream != null) infoStream.print("merging segments");
SegmentMerger merger =
new SegmentMerger(directory, mergedName, useCompoundFile);
final Vector segmentsToDelete = new Vector();
for (int i = minSegment; i < segmentInfos.size(); i++) {
SegmentInfo si = segmentInfos.info(i);
if (infoStream != null)
infoStream.print(" " + si.name + " (" + si.docCount + " docs)");
IndexReader reader = new SegmentReader(si);
merger.add(reader);
if ((reader.directory() == this.directory) || // if we own the directory
(reader.directory() == this.ramDirectory))
segmentsToDelete.addElement(reader); // queue segment for deletion
}
int mergedDocCount = merger.merge();
if (infoStream != null) {
infoStream.println(" into "+mergedName+" ("+mergedDocCount+" docs)");
}
segmentInfos.setSize(minSegment); // pop old infos & add new
segmentInfos.addElement(new SegmentInfo(mergedName, mergedDocCount,
directory));
// close readers before we attempt to delete now-obsolete segments
merger.closeReaders();
synchronized (directory) { // in- & inter-process sync
new Lock.With(directory.makeLock(IndexWriter.COMMIT_LOCK_NAME), COMMIT_LOCK_TIMEOUT) {
public Object doBody() throws IOException {
segmentInfos.write(directory); // commit before deleting
deleteSegments(segmentsToDelete); // delete now-unused segments
return null;
}
}.run();
}
}
/* Some operating systems (e.g. Windows) don't permit a file to be deleted
while it is opened for read (e.g. by another process or thread). So we
assume that when a delete fails it is because the file is open in another
process, and queue the file for subsequent deletion. */
private final void deleteSegments(Vector segments) throws IOException {
Vector deletable = new Vector();
deleteFiles(readDeleteableFiles(), deletable); // try to delete deleteable
for (int i = 0; i < segments.size(); i++) {
SegmentReader reader = (SegmentReader)segments.elementAt(i);
if (reader.directory() == this.directory)
deleteFiles(reader.files(), deletable); // try to delete our files
else
deleteFiles(reader.files(), reader.directory()); // delete other files
}
writeDeleteableFiles(deletable); // note files we can't delete
}
private final void deleteFiles(Vector files, Directory directory)
throws IOException {
for (int i = 0; i < files.size(); i++)
directory.deleteFile((String)files.elementAt(i));
}
private final void deleteFiles(Vector files, Vector deletable)
throws IOException {
for (int i = 0; i < files.size(); i++) {
String file = (String)files.elementAt(i);
try {
directory.deleteFile(file); // try to delete each file
} catch (IOException e) { // if delete fails
if (directory.fileExists(file)) {
if (infoStream != null)
infoStream.println(e.getMessage() + "; Will re-try later.");
deletable.addElement(file); // add to deletable
}
}
}
}
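// The "deletable" file records the names of files whose deletion failed and
// should be retried: an int count followed by that many strings, as read
// and written by the two methods below.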
private final Vector readDeleteableFiles() throws IOException {
Vector result = new Vector();
if (!directory.fileExists("deletable"))
return result;
InputStream input = directory.openFile("deletable");
try {
for (int i = input.readInt(); i > 0; i--) // read file names
result.addElement(input.readString());
} finally {
input.close();
}
return result;
}
private final void writeDeleteableFiles(Vector files) throws IOException {
OutputStream output = directory.createFile("deleteable.new");
try {
output.writeInt(files.size());
for (int i = 0; i < files.size(); i++)
output.writeString((String)files.elementAt(i));
} finally {
output.close();
}
directory.renameFile("deleteable.new", "deletable");
}
}