File: SegmentMerger.java
Doc: Apache Lucene 2.2.0 API
Size: 16620 bytes
Date: Sat Jun 16 22:20:36 BST 2007
Package: org.apache.lucene.index

SegmentMerger

public final class SegmentMerger extends Object
The SegmentMerger class combines two or more Segments, each represented by an IndexReader ({@link #add}), into a single Segment. After adding the appropriate readers, call the merge method to combine the segments.

If the compoundFile flag is set, then the segments will be merged into a compound file.

see #merge
see #add
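
A minimal sketch of the intended call sequence, assuming the calling code lives in the org.apache.lucene.index package (the constructors are not public) and that the Directory and input readers already exist. The helper class, the segment name "newSegment", and the mergeSegments method are placeholders for illustration; in practice IndexWriter constructs and drives SegmentMerger internally:

    // assumes: package org.apache.lucene.index;
    import java.io.IOException;
    import org.apache.lucene.store.Directory;

    class MergeExample {                       // hypothetical helper class
      static int mergeSegments(Directory dir, IndexReader[] inputs)
          throws CorruptIndexException, IOException {
        SegmentMerger merger = new SegmentMerger(dir, "newSegment");
        for (int i = 0; i < inputs.length; i++)
          merger.add(inputs[i]);               // register each input segment
        int mergedDocs = merger.merge();       // writes newSegment.* into dir
        merger.closeReaders();                 // close the inputs when done
        return mergedDocs;
      }
    }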

Fields Summary
static final byte[]
NORMS_HEADER
norms header placeholder
private Directory
directory
private String
segment
private int
termIndexInterval
private Vector
readers
private FieldInfos
fieldInfos
private int
mergedDocs
private IndexOutput
freqOutput
private IndexOutput
proxOutput
private TermInfosWriter
termInfosWriter
private int
skipInterval
private int
maxSkipLevels
private SegmentMergeQueue
queue
private DefaultSkipListWriter
skipListWriter
private final TermInfo
termInfo
reused for each merged term, to minimize consing
private byte[]
payloadBuffer
Constructors Summary
SegmentMerger(Directory dir, String name)
This ctor is used only by test code.

param
dir The Directory to merge the other segments into
param
name The name of the new segment

    directory = dir;
    segment = name;
  
SegmentMerger(IndexWriter writer, String name)

    directory = writer.getDirectory();
    segment = name;
    termIndexInterval = writer.getTermIndexInterval();
  
Methods Summary
final void add(org.apache.lucene.index.IndexReader reader)
Add an IndexReader to the collection of readers that are to be merged

param
reader

    readers.addElement(reader);
  
private void addIndexed(org.apache.lucene.index.IndexReader reader, org.apache.lucene.index.FieldInfos fieldInfos, java.util.Collection names, boolean storeTermVectors, boolean storePositionWithTermVector, boolean storeOffsetWithTermVector, boolean storePayloads)

    Iterator i = names.iterator();
    while (i.hasNext()) {
      String field = (String)i.next();
      fieldInfos.add(field, true, storeTermVectors, storePositionWithTermVector, storeOffsetWithTermVector, !reader.hasNorms(field), storePayloads);
    }
  
private final int appendPostings(org.apache.lucene.index.SegmentMergeInfo[] smis, int n)
Process postings from multiple segments all positioned on the same term. Writes out merged entries into the freqOutput and proxOutput streams.

param
smis array of segments
param
n number of cells in the array actually occupied
return
number of documents across all segments where this term was found
throws
CorruptIndexException if the index is corrupt
throws
IOException if there is a low-level IO error

    int lastDoc = 0;
    int df = 0;					  // number of docs w/ term
    skipListWriter.resetSkip();
    boolean storePayloads = fieldInfos.fieldInfo(smis[0].term.field).storePayloads;
    int lastPayloadLength = -1;   // ensures that we write the first length
    for (int i = 0; i < n; i++) {
      SegmentMergeInfo smi = smis[i];
      TermPositions postings = smi.getPositions();
      int base = smi.base;
      int[] docMap = smi.getDocMap();
      postings.seek(smi.termEnum);
      while (postings.next()) {
        int doc = postings.doc();
        if (docMap != null)
          doc = docMap[doc];                      // map around deletions
        doc += base;                              // convert to merged space

        if (doc < 0 || (df > 0 && doc <= lastDoc))
          throw new CorruptIndexException("docs out of order (" + doc +
              " <= " + lastDoc + " )");

        df++;

        if ((df % skipInterval) == 0) {
          skipListWriter.setSkipData(lastDoc, storePayloads, lastPayloadLength);
          skipListWriter.bufferSkip(df);
        }

        int docCode = (doc - lastDoc) << 1;	  // use low bit to flag freq=1
        lastDoc = doc;

        int freq = postings.freq();
        if (freq == 1) {
          freqOutput.writeVInt(docCode | 1);	  // write doc & freq=1
        } else {
          freqOutput.writeVInt(docCode);	  // write doc
          freqOutput.writeVInt(freq);		  // write frequency in doc
        }
        
        /** See {@link DocumentWriter#writePostings(Posting[], String)} for
         *  documentation about the encoding of positions and payloads.
         */
        int lastPosition = 0;			  // write position deltas
        for (int j = 0; j < freq; j++) {
          int position = postings.nextPosition();
          int delta = position - lastPosition;
          if (storePayloads) {
            int payloadLength = postings.getPayloadLength();
            if (payloadLength == lastPayloadLength) {
              proxOutput.writeVInt(delta * 2);
            } else {
              proxOutput.writeVInt(delta * 2 + 1);
              proxOutput.writeVInt(payloadLength);
              lastPayloadLength = payloadLength;
            }
            if (payloadLength > 0) {
              if (payloadBuffer == null || payloadBuffer.length < payloadLength) {
                payloadBuffer = new byte[payloadLength];
              }
              postings.getPayload(payloadBuffer, 0);
              proxOutput.writeBytes(payloadBuffer, 0, payloadLength);
            }
          } else {
            proxOutput.writeVInt(delta);
          }
          lastPosition = position;
        }
      }
    }
    return df;
  
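
appendPostings packs each doc delta into the upper bits of a VInt with the low bit flagging freq == 1, and, when payloads are stored, packs each position delta the same way with the low bit flagging a changed payload length. A hedged sketch of the inverse decoding, assuming frq and prx are IndexInput streams positioned at this term's postings; the method name is hypothetical and this is an illustration, not Lucene's actual posting-reader code:

    // fragment; place inside a class
    import java.io.IOException;
    import org.apache.lucene.store.IndexInput;

    static void scanPostings(IndexInput frq, IndexInput prx,
                             int df, boolean storePayloads) throws IOException {
      int lastDoc = 0;
      int payloadLength = -1;         // persists across docs, like the writer's
      byte[] payload = new byte[0];   // grown on demand
      for (int i = 0; i < df; i++) {
        int docCode = frq.readVInt();
        int doc = lastDoc + (docCode >>> 1);   // upper bits carry the doc delta
        int freq = ((docCode & 1) != 0)
            ? 1                                // low bit set: freq == 1
            : frq.readVInt();                  // otherwise freq follows
        lastDoc = doc;

        int position = 0;             // position deltas restart for each doc
        for (int j = 0; j < freq; j++) {
          if (storePayloads) {
            int code = prx.readVInt();
            position += code >>> 1;   // upper bits carry the position delta
            if ((code & 1) != 0)      // low bit set: payload length changed
              payloadLength = prx.readVInt();
            if (payloadLength > payload.length)
              payload = new byte[payloadLength];
            if (payloadLength > 0)
              prx.readBytes(payload, 0, payloadLength);
          } else {
            position += prx.readVInt();
          }
        }
      }
    }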
final void closeReaders()
Close all IndexReaders that have been added. Should not be called before merge().

throws
IOException

    for (int i = 0; i < readers.size(); i++) {  // close readers
      IndexReader reader = (IndexReader) readers.elementAt(i);
      reader.close();
    }
  
final java.util.Vector createCompoundFile(java.lang.String fileName)

    CompoundFileWriter cfsWriter =
            new CompoundFileWriter(directory, fileName);

    Vector files =
      new Vector(IndexFileNames.COMPOUND_EXTENSIONS.length + 1);    
    
    // Basic files
    for (int i = 0; i < IndexFileNames.COMPOUND_EXTENSIONS.length; i++) {
      files.add(segment + "." + IndexFileNames.COMPOUND_EXTENSIONS[i]);
    }

    // Fieldable norm files
    for (int i = 0; i < fieldInfos.size(); i++) {
      FieldInfo fi = fieldInfos.fieldInfo(i);
      if (fi.isIndexed && !fi.omitNorms) {
        files.add(segment + "." + IndexFileNames.NORMS_EXTENSION);
        break;
      }
    }

    // Vector files
    if (fieldInfos.hasVectors()) {
      for (int i = 0; i < IndexFileNames.VECTOR_EXTENSIONS.length; i++) {
        files.add(segment + "." + IndexFileNames.VECTOR_EXTENSIONS[i]);
      }
    }

    // Now merge all added files
    Iterator it = files.iterator();
    while (it.hasNext()) {
      cfsWriter.addFile((String) it.next());
    }
    
    // Perform the merge
    cfsWriter.close();
   
    return files;
  
final int merge()
Merges the readers specified by the {@link #add} method into the directory passed to the constructor.

return
The number of documents that were merged
throws
CorruptIndexException if the index is corrupt
throws
IOException if there is a low-level IO error

    mergedDocs = mergeFields();
    mergeTerms();
    mergeNorms();

    if (fieldInfos.hasVectors())
      mergeVectors();

    return mergedDocs;
  
private final int mergeFields()

return
The number of documents in all of the readers
throws
CorruptIndexException if the index is corrupt
throws
IOException if there is a low-level IO error

    fieldInfos = new FieldInfos();		  // merge field names
    int docCount = 0;
    for (int i = 0; i < readers.size(); i++) {
      IndexReader reader = (IndexReader) readers.elementAt(i);
      addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION_OFFSET), true, true, true, false);
      addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_POSITION), true, true, false, false);
      addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR_WITH_OFFSET), true, false, true, false);
      addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.TERMVECTOR), true, false, false, false);
      addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.STORES_PAYLOADS), false, false, false, true);
      addIndexed(reader, fieldInfos, reader.getFieldNames(IndexReader.FieldOption.INDEXED), false, false, false, false);
      fieldInfos.add(reader.getFieldNames(IndexReader.FieldOption.UNINDEXED), false);
    }
    fieldInfos.write(directory, segment + ".fnm");

    FieldsWriter fieldsWriter = // merge field values
            new FieldsWriter(directory, segment, fieldInfos);
    
    // for merging we don't want to compress/uncompress the data, so to tell the
    // FieldsReader that we're in merge mode, we use this FieldSelector
    FieldSelector fieldSelectorMerge = new FieldSelector() {
      public FieldSelectorResult accept(String fieldName) {
        return FieldSelectorResult.LOAD_FOR_MERGE;
      }        
    };
    
    try {
      for (int i = 0; i < readers.size(); i++) {
        IndexReader reader = (IndexReader) readers.elementAt(i);
        int maxDoc = reader.maxDoc();
        for (int j = 0; j < maxDoc; j++)
          if (!reader.isDeleted(j)) {               // skip deleted docs
            fieldsWriter.addDocument(reader.document(j, fieldSelectorMerge));
            docCount++;
          }
      }
    } finally {
      fieldsWriter.close();
    }
    return docCount;
  
private void mergeNorms()

    byte[] normBuffer = null;
    IndexOutput output = null;
    try {
      for (int i = 0; i < fieldInfos.size(); i++) {
        FieldInfo fi = fieldInfos.fieldInfo(i);
        if (fi.isIndexed && !fi.omitNorms) {
          if (output == null) { 
            output = directory.createOutput(segment + "." + IndexFileNames.NORMS_EXTENSION);
            output.writeBytes(NORMS_HEADER,NORMS_HEADER.length);
          }
          for (int j = 0; j < readers.size(); j++) {
            IndexReader reader = (IndexReader) readers.elementAt(j);
            int maxDoc = reader.maxDoc();
            if (normBuffer == null || normBuffer.length < maxDoc) {
              // the buffer is too small for the current segment
              normBuffer = new byte[maxDoc];
            }
            reader.norms(fi.name, normBuffer, 0);
            if (!reader.hasDeletions()) {
              //optimized case for segments without deleted docs
              output.writeBytes(normBuffer, maxDoc);
            } else {
              // this segment has deleted docs, so we have to
              // check for every doc if it is deleted or not
              for (int k = 0; k < maxDoc; k++) {
                if (!reader.isDeleted(k)) {
                  output.writeByte(normBuffer[k]);
                }
              }
            }
          }
        }
      }
    } finally {
      if (output != null) { 
        output.close();
      }
    }
  
private final void mergeTermInfo(org.apache.lucene.index.SegmentMergeInfo[] smis, int n)
Merge one term found in one or more segments. The array smis contains segments that are positioned on the same term; n is the number of cells in the array actually occupied.

param
smis array of segments
param
n number of cells in the array actually occupied
throws
CorruptIndexException if the index is corrupt
throws
IOException if there is a low-level IO error

    long freqPointer = freqOutput.getFilePointer();
    long proxPointer = proxOutput.getFilePointer();

    int df = appendPostings(smis, n);		  // append posting data

    long skipPointer = skipListWriter.writeSkip(freqOutput);

    if (df > 0) {
      // add an entry to the dictionary with pointers to prox and freq files
      termInfo.set(df, freqPointer, proxPointer, (int) (skipPointer - freqPointer));
      termInfosWriter.add(smis[0].term, termInfo);
    }
  
private final void mergeTermInfos()

    int base = 0;
    for (int i = 0; i < readers.size(); i++) {
      IndexReader reader = (IndexReader) readers.elementAt(i);
      TermEnum termEnum = reader.terms();
      SegmentMergeInfo smi = new SegmentMergeInfo(base, termEnum, reader);
      base += reader.numDocs();
      if (smi.next())
        queue.put(smi);				  // initialize queue
      else
        smi.close();
    }

    SegmentMergeInfo[] match = new SegmentMergeInfo[readers.size()];

    while (queue.size() > 0) {
      int matchSize = 0;			  // pop matching terms
      match[matchSize++] = (SegmentMergeInfo) queue.pop();
      Term term = match[0].term;
      SegmentMergeInfo top = (SegmentMergeInfo) queue.top();

      while (top != null && term.compareTo(top.term) == 0) {
        match[matchSize++] = (SegmentMergeInfo) queue.pop();
        top = (SegmentMergeInfo) queue.top();
      }

      mergeTermInfo(match, matchSize);		  // add new TermInfo

      while (matchSize > 0) {
        SegmentMergeInfo smi = match[--matchSize];
        if (smi.next())
          queue.put(smi);			  // restore queue
        else
          smi.close();				  // done with a segment
      }
    }
  
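
The loop above is a classic k-way merge: pop every cursor positioned on the smallest current term, merge their postings, then re-insert the cursors that still have terms. The same discipline in generic form, using java.util.PriorityQueue in place of Lucene's SegmentMergeQueue; Cursor and mergeOneTerm are hypothetical stand-ins for SegmentMergeInfo and mergeTermInfo:

    import java.util.ArrayList;
    import java.util.List;
    import java.util.PriorityQueue;

    abstract class KWayTermMerge {

      // per-segment cursor ordered by its current term (sketch, not Lucene API)
      interface Cursor extends Comparable<Cursor> {
        String term();      // current term the cursor is positioned on
        boolean next();     // advance; false once the segment is exhausted
        void close();
      }

      abstract void mergeOneTerm(List<Cursor> match);  // e.g. append postings

      void merge(List<Cursor> cursors) {
        PriorityQueue<Cursor> queue = new PriorityQueue<Cursor>(cursors);
        List<Cursor> match = new ArrayList<Cursor>();
        while (!queue.isEmpty()) {
          match.clear();
          match.add(queue.poll());                // cursor on the smallest term
          while (!queue.isEmpty()
                 && queue.peek().term().equals(match.get(0).term())) {
            match.add(queue.poll());              // every cursor on that term
          }
          mergeOneTerm(match);                    // combine their postings
          for (Cursor c : match) {
            if (c.next()) queue.add(c);           // re-enter with its next term
            else c.close();                       // this segment is done
          }
        }
      }
    }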
private final void mergeTerms()

    try {
      freqOutput = directory.createOutput(segment + ".frq");
      proxOutput = directory.createOutput(segment + ".prx");
      termInfosWriter =
              new TermInfosWriter(directory, segment, fieldInfos,
                                  termIndexInterval);
      skipInterval = termInfosWriter.skipInterval;
      maxSkipLevels = termInfosWriter.maxSkipLevels;
      skipListWriter = new DefaultSkipListWriter(skipInterval, maxSkipLevels, mergedDocs, freqOutput, proxOutput);
      queue = new SegmentMergeQueue(readers.size());

      mergeTermInfos();

    } finally {
      if (freqOutput != null) freqOutput.close();
      if (proxOutput != null) proxOutput.close();
      if (termInfosWriter != null) termInfosWriter.close();
      if (queue != null) queue.close();
    }
  
private final void mergeVectors()
Merge the TermVectors from each of the segments into the new one.

throws
IOException

    TermVectorsWriter termVectorsWriter = 
      new TermVectorsWriter(directory, segment, fieldInfos);

    try {
      for (int r = 0; r < readers.size(); r++) {
        IndexReader reader = (IndexReader) readers.elementAt(r);
        int maxDoc = reader.maxDoc();
        for (int docNum = 0; docNum < maxDoc; docNum++) {
          // skip deleted docs
          if (reader.isDeleted(docNum)) 
            continue;
          termVectorsWriter.addAllDocVectors(reader.getTermFreqVectors(docNum));
        }
      }
    } finally {
      termVectorsWriter.close();
    }
  
final org.apache.lucene.index.IndexReader segmentReader(int i)

param
i The index of the reader to return
return
The ith reader to be merged

    return (IndexReader) readers.elementAt(i);