File | Doc | Category | Size | Date | Package
TermVectorsWriter.java | API Doc | Apache Lucene 1.9 | 11545 | Mon Feb 20 09:20:14 GMT 2006 | org.apache.lucene.index

TermVectorsWriter

public final class TermVectorsWriter extends Object
Writer works by opening a document, then opening each field within the document, and then writing out the term vectors for that field. Rough usage:

for each document {
  writer.openDocument();
  for each field on the document {
    writer.openField(field);
    for all of the terms {
      writer.addTerm(...);
    }
    writer.closeField();
  }
  writer.closeDocument();
}
version
$Id: TermVectorsWriter.java 150689 2004-11-29 21:42:02Z bmesser $

Fields Summary
static final byte
STORE_POSITIONS_WITH_TERMVECTOR
static final byte
STORE_OFFSET_WITH_TERMVECTOR
static final int
FORMAT_VERSION
static final int
FORMAT_SIZE
static final String
TVX_EXTENSION
static final String
TVD_EXTENSION
static final String
TVF_EXTENSION
private IndexOutput
tvx
private IndexOutput
tvd
private IndexOutput
tvf
private Vector
fields
private Vector
terms
private FieldInfos
fieldInfos
private TVField
currentField
private long
currentDocPointer
Constructors Summary
public TermVectorsWriter(Directory directory, String segment, FieldInfos fieldInfos)


      
                            
      
    // Create the three term-vector outputs for this segment. Each file name is
    // the segment name with TVX_EXTENSION, TVD_EXTENSION or TVF_EXTENSION
    // appended, and each file starts with the FORMAT_VERSION int.
    //   tvx - receives one document pointer per document (see writeDoc)
    //   tvd - receives the per-document field numbers and field pointers
    //   tvf - receives the per-field term data (see writeField)
    // NOTE(review): if a later createOutput/writeInt call fails, the outputs
    // opened before it are not closed here - confirm the caller cleans up.
    tvx = directory.createOutput(segment + TVX_EXTENSION);
    tvx.writeInt(FORMAT_VERSION);
    tvd = directory.createOutput(segment + TVD_EXTENSION);
    tvd.writeInt(FORMAT_VERSION);
    tvf = directory.createOutput(segment + TVF_EXTENSION);
    tvf.writeInt(FORMAT_VERSION);

    this.fieldInfos = fieldInfos;
    // Fields closed so far for the current document; presized to the number
    // of fields known to fieldInfos.
    fields = new Vector(fieldInfos.size());
    // Terms buffered for the field that is currently open.
    terms = new Vector();
  
Methods Summary
public final voidaddAllDocVectors(org.apache.lucene.index.TermFreqVector[] vectors)
Add a complete document specified by all its term vectors. If document has no term vectors, add value for tvx.

param
vectors
throws
IOException

    // Opening the document first means a document without vectors still gets
    // its record written when closeDocument() runs at the end.
    openDocument();

    if (vectors != null) {
      for (int i = 0; i < vectors.length; i++) {
        // Decide by type test instead of catching ClassCastException: a catch
        // would also swallow a ClassCastException raised after the field was
        // opened, and the fallback branch would then write the same field a
        // second time.
        if (vectors[i] instanceof TermPositionVector) {
          final TermPositionVector tpVector = (TermPositionVector) vectors[i];
          final int termCount = tpVector.size();

          // Whether positions/offsets are stored is inferred from the first
          // term: a non-null array there turns the corresponding flag on.
          final boolean storePositionWithTermVector =
              termCount > 0 && tpVector.getTermPositions(0) != null;
          final boolean storeOffsetWithTermVector =
              termCount > 0 && tpVector.getOffsets(0) != null;

          final FieldInfo fieldInfo = fieldInfos.fieldInfo(tpVector.getField());
          openField(fieldInfo.number, storePositionWithTermVector, storeOffsetWithTermVector);

          // Fetch the parallel arrays once instead of once per term.
          final String[] termTexts = tpVector.getTerms();
          final int[] termFreqs = tpVector.getTermFrequencies();
          for (int j = 0; j < termCount; j++) {
            addTermInternal(termTexts[j], termFreqs[j], tpVector.getTermPositions(j),
                tpVector.getOffsets(j));
          }

          closeField();

        } else {
          // Plain frequency vector: no positions and no offsets are stored.
          final TermFreqVector tfVector = vectors[i];

          final FieldInfo fieldInfo = fieldInfos.fieldInfo(tfVector.getField());
          openField(fieldInfo.number, false, false);

          final int termCount = tfVector.size();
          final String[] termTexts = tfVector.getTerms();
          final int[] termFreqs = tfVector.getTermFrequencies();
          for (int j = 0; j < termCount; j++) {
            addTermInternal(termTexts[j], termFreqs[j], null, null);
          }

          closeField();
        }
      }
    }

    closeDocument();
  
public final voidaddTerm(java.lang.String termText, int freq, int[] positions, org.apache.lucene.index.TermVectorOffsetInfo[] offsets)

    // Terms can only be buffered while both a document and a field are open.
    // The document is checked first, so its message is the one reported when
    // neither is open.
    if (isDocumentOpen() == false) {
      throw new IllegalStateException("Cannot add terms when document is not open");
    }
    if (isFieldOpen() == false) {
      throw new IllegalStateException("Cannot add terms when field is not open");
    }

    addTermInternal(termText, freq, positions, offsets);
  
public final voidaddTerm(java.lang.String termText, int freq)
Add term to the field's term vector. Field must already be open. Terms should be added in increasing order of terms, one call per unique termNum. ProxPointer is a pointer into the TermPosition file (prx). Freq is the number of times this term appears in this field, in this document.

throws
IllegalStateException if document or field is not open

    addTerm(termText, freq, null, null);
  
private final voidaddTermInternal(java.lang.String termText, int freq, int[] positions, org.apache.lucene.index.TermVectorOffsetInfo[] offsets)

    TVTerm term = new TVTerm();
    term.termText = termText;
    term.freq = freq;
    term.positions = positions;
    term.offsets = offsets;
    terms.add(term);
  
final voidclose()
Close all streams.

    try {
      closeDocument();
    } finally {
      // make an effort to close all streams we can but remember and re-throw
      // the first exception encountered in this process
      IOException keep = null;
      if (tvx != null)
        try {
          tvx.close();
        } catch (IOException e) {
          if (keep == null) keep = e;
        }
      if (tvd != null)
        try {
          tvd.close();
        } catch (IOException e) {
          if (keep == null) keep = e;
        }
      if (tvf != null)
        try {
          tvf.close();
        } catch (IOException e) {
          if (keep == null) keep = e;
        }
      if (keep != null) throw (IOException) keep.fillInStackTrace();
    }
  
public final voidcloseDocument()

    // Nothing to do when no document is open.
    if (!isDocumentOpen()) {
      return;
    }

    // Flush the field that may still be open, write the document's records,
    // then reset the per-document state; -1 marks "no document open".
    closeField();
    writeDoc();
    fields.clear();
    currentDocPointer = -1;
  
public final voidcloseField()
Finished processing current field. This should be followed by a call to openField before future calls to addTerm.

    // Closing when no field is open is a no-op.
    if (!isFieldOpen()) {
      return;
    }

    // Write the buffered terms, remember the field for the document record,
    // then reset the per-field state.
    writeField();
    fields.add(currentField);
    terms.clear();
    currentField = null;
  
public final booleanisDocumentOpen()

    return currentDocPointer != -1;
  
public final booleanisFieldOpen()
Return true if a field is currently open.

    return currentField != null;
  
public final voidopenDocument()

    // Finish any document that is still open so that its records are written
    // before the new document's start position is taken.
    closeDocument();
    // Remember where this document's record will start in tvd. A value other
    // than -1 is also what isDocumentOpen() treats as "a document is open".
    currentDocPointer = tvd.getFilePointer();
  
public final voidopenField(java.lang.String field)
Start processing a field. This can be followed by a number of calls to addTerm, and a final call to closeField to indicate the end of processing of this field. If a field was previously open, it is closed automatically.

    FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
    openField(fieldInfo.number, fieldInfo.storePositionWithTermVector, fieldInfo.storeOffsetWithTermVector);
  
private voidopenField(int fieldNumber, boolean storePositionWithTermVector, boolean storeOffsetWithTermVector)

    // A field can only be opened inside an open document.
    if (isDocumentOpen() == false) {
      throw new IllegalStateException("Cannot open field when no document is open.");
    }

    // Flush the previously open field, if any, before starting the new one.
    closeField();
    currentField = new TVField(fieldNumber, storePositionWithTermVector, storeOffsetWithTermVector);
  
private voidwriteDoc()

    // All fields must have been closed (and therefore written) by now.
    if (isFieldOpen()) {
      throw new IllegalStateException("Field is still open while writing document");
    }

    // Index record: the position in tvd where this document's data starts.
    tvx.writeLong(currentDocPointer);

    // Data record: the field count, then every field number, then every
    // field's tvf pointer.
    final int fieldCount = fields.size();
    tvd.writeVInt(fieldCount);

    for (int n = 0; n < fieldCount; n++) {
      final TVField entry = (TVField) fields.get(n);
      tvd.writeVInt(entry.number);
    }

    // Pointers are stored as deltas against the previous field's pointer,
    // starting from 0 for the first field.
    long previousPointer = 0;
    for (int n = 0; n < fieldCount; n++) {
      final TVField entry = (TVField) fields.get(n);
      tvd.writeVLong(entry.tvfPointer - previousPointer);
      previousPointer = entry.tvfPointer;
    }
  
private voidwriteField()

    // Writes the buffered terms of the currently open field to tvf as one
    // record: a VInt term count, one flag byte, then for each term its
    // prefix-compressed text and frequency followed, when the field's flags
    // ask for them, by its positions and its offsets.
    // Throws IllegalStateException when the field is flagged to store
    // positions/offsets but a buffered term carries none.

    // remember where this field is written
    // (writeDoc() later stores this pointer in tvd)
    currentField.tvfPointer = tvf.getFilePointer();
    //System.out.println("Field Pointer: " + currentField.tvfPointer);
    
    // number of terms in this field
    final int size = terms.size();
    tvf.writeVInt(size);
    
    // flag byte: records which optional per-term data follows each term
    boolean storePositions = currentField.storePositions;
    boolean storeOffsets = currentField.storeOffsets;
    byte bits = 0x0;
    if (storePositions) 
      bits |= STORE_POSITIONS_WITH_TERMVECTOR;
    if (storeOffsets) 
      bits |= STORE_OFFSET_WITH_TERMVECTOR;
    tvf.writeByte(bits);
    
    // Each term's text is written relative to the previous term's text: the
    // length of the part they share, then only the remaining characters.
    // NOTE(review): presumably StringHelper.stringDifference returns the
    // length of the common prefix of its two arguments - confirm.
    String lastTermText = "";
    for (int i = 0; i < size; i++) {
      TVTerm term = (TVTerm) terms.elementAt(i);
      int start = StringHelper.stringDifference(lastTermText, term.termText);
      int length = term.termText.length() - start;
      tvf.writeVInt(start);       // write shared prefix length
      tvf.writeVInt(length);        // write delta length
      tvf.writeChars(term.termText, start, length);  // write delta chars
      tvf.writeVInt(term.freq);
      lastTermText = term.termText;
      
      if(storePositions){
        if(term.positions == null)
          throw new IllegalStateException("Trying to write positions that are null!");
        
        // use delta encoding for positions
        // (each value is the difference from the previous position, starting at 0)
        // NOTE(review): reads term.freq entries - assumes positions has at
        // least that many elements; confirm callers guarantee this.
        int position = 0;
        for (int j = 0; j < term.freq; j++){
          tvf.writeVInt(term.positions[j] - position);
          position = term.positions[j];
        }
      }
      
      if(storeOffsets){
        if(term.offsets == null)
          throw new IllegalStateException("Trying to write offsets that are null!");
        
        // use delta encoding for offsets
        // (start offset is stored relative to the previous end offset, and the
        // end offset is stored as the distance from its own start offset)
        // NOTE(review): reads term.freq entries - assumes offsets has at
        // least that many elements; confirm callers guarantee this.
        int position = 0;
        for (int j = 0; j < term.freq; j++) {
          tvf.writeVInt(term.offsets[j].getStartOffset() - position);
          tvf.writeVInt(term.offsets[j].getEndOffset() - term.offsets[j].getStartOffset()); //Save the diff between the two.
          position = term.offsets[j].getEndOffset();
        }
      }
    }