File: TermVectorsReader.java
Doc category: API Doc · Apache Lucene 2.2.0 · 10915 bytes · Sat Jun 16 22:20:36 BST 2007 · package org.apache.lucene.index

TermVectorsReader

public class TermVectorsReader extends Object implements Cloneable
version
$Id: TermVectorsReader.java 542561 2007-05-29 15:14:07Z mikemccand $

Fields Summary
private FieldInfos
fieldInfos
private IndexInput
tvx
private IndexInput
tvd
private IndexInput
tvf
private int
size
private int
tvdFormat
private int
tvfFormat
Constructors Summary
  /**
   * Convenience constructor: opens the term vector files for {@code segment}
   * in directory {@code d} using the default read buffer size
   * ({@code BufferedIndexInput.BUFFER_SIZE}).
   */
TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos)

    this(d, segment, fieldInfos, BufferedIndexInput.BUFFER_SIZE);
  
TermVectorsReader(Directory d, String segment, FieldInfos fieldInfos, int readBufferSize)

    if (d.fileExists(segment + TermVectorsWriter.TVX_EXTENSION)) {
      tvx = d.openInput(segment + TermVectorsWriter.TVX_EXTENSION, readBufferSize);
      checkValidFormat(tvx);
      tvd = d.openInput(segment + TermVectorsWriter.TVD_EXTENSION, readBufferSize);
      tvdFormat = checkValidFormat(tvd);
      tvf = d.openInput(segment + TermVectorsWriter.TVF_EXTENSION, readBufferSize);
      tvfFormat = checkValidFormat(tvf);
      size = (int) tvx.length() / 8;
    }

    this.fieldInfos = fieldInfos;
  
Methods Summary
private intcheckValidFormat(org.apache.lucene.store.IndexInput in)

    int format = in.readInt();
    if (format > TermVectorsWriter.FORMAT_VERSION)
    {
      throw new CorruptIndexException("Incompatible format version: " + format + " expected " 
                                      + TermVectorsWriter.FORMAT_VERSION + " or less");
    }
    return format;
  
  /**
   * Returns a copy of this reader whose three streams are themselves clones,
   * so the copy can be read/positioned independently of the original.
   * Returns null if this reader has no term vector files open.
   */
protected java.lang.Objectclone()

    
    // No term vectors for this segment: nothing to clone.
    if (tvx == null || tvd == null || tvf == null)
      return null;
    
    TermVectorsReader clone = null;
    try {
      clone = (TermVectorsReader) super.clone();
    } catch (CloneNotSupportedException e) {}  // cannot happen: this class implements Cloneable

    // Give the copy its own stream clones so seeks don't interfere.
    clone.tvx = (IndexInput) tvx.clone();
    clone.tvd = (IndexInput) tvd.clone();
    clone.tvf = (IndexInput) tvf.clone();
    
    return clone;
  
voidclose()

  	// make all effort to close up. Keep the first exception
  	// and throw it as a new one.
  	IOException keep = null;
  	if (tvx != null) try { tvx.close(); } catch (IOException e) { if (keep == null) keep = e; }
  	if (tvd != null) try { tvd.close(); } catch (IOException e) { if (keep == null) keep = e; }
  	if (tvf  != null) try {  tvf.close(); } catch (IOException e) { if (keep == null) keep = e; }
  	if (keep != null) throw (IOException) keep.fillInStackTrace();
  
org.apache.lucene.index.TermFreqVectorget(int docNum, java.lang.String field)
Retrieve the term vector for the given document and field

param
docNum The document number to retrieve the vector for
param
field The field within the document to retrieve
return
The TermFreqVector for the document and field or null if there is no termVector for this field.
throws
IOException if there is an error reading the term vector files

    // Check if no term vectors are available for this segment at all
    int fieldNumber = fieldInfos.fieldNumber(field);
    TermFreqVector result = null;
    if (tvx != null) {
      //We need to account for the FORMAT_SIZE at when seeking in the tvx
      //We don't need to do this in other seeks because we already have the
      // file pointer
      //that was written in another file
      tvx.seek((docNum * 8L) + TermVectorsWriter.FORMAT_SIZE);
      //System.out.println("TVX Pointer: " + tvx.getFilePointer());
      long position = tvx.readLong();

      tvd.seek(position);
      int fieldCount = tvd.readVInt();
      //System.out.println("Num Fields: " + fieldCount);
      // There are only a few fields per document. We opt for a full scan
      // rather then requiring that they be ordered. We need to read through
      // all of the fields anyway to get to the tvf pointers.
      int number = 0;
      int found = -1;
      for (int i = 0; i < fieldCount; i++) {
        if(tvdFormat == TermVectorsWriter.FORMAT_VERSION)
          number = tvd.readVInt();
        else
          number += tvd.readVInt();
        
        if (number == fieldNumber)
          found = i;
      }

      // This field, although valid in the segment, was not found in this
      // document
      if (found != -1) {
        // Compute position in the tvf file
        position = 0;
        for (int i = 0; i <= found; i++)
          position += tvd.readVLong();

        result = readTermVector(field, position);
      } else {
        //System.out.println("Fieldable not found");
      }
    } else {
      //System.out.println("No tvx file");
    }
    return result;
  
org.apache.lucene.index.TermFreqVector[]get(int docNum)
Return all term vectors stored for this document or null if the could not be read in.

param
docNum The document number to retrieve the vector for
return
All term frequency vectors
throws
IOException if there is an error reading the term vector files

    TermFreqVector[] result = null;
    // Check if no term vectors are available for this segment at all
    if (tvx != null) {
      //We need to offset by
      tvx.seek((docNum * 8L) + TermVectorsWriter.FORMAT_SIZE);
      long position = tvx.readLong();

      tvd.seek(position);
      int fieldCount = tvd.readVInt();

      // No fields are vectorized for this document
      if (fieldCount != 0) {
        int number = 0;
        String[] fields = new String[fieldCount];
        
        for (int i = 0; i < fieldCount; i++) {
          if(tvdFormat == TermVectorsWriter.FORMAT_VERSION)
            number = tvd.readVInt();
          else
            number += tvd.readVInt();

          fields[i] = fieldInfos.fieldName(number);
        }

        // Compute position in the tvf file
        position = 0;
        long[] tvfPointers = new long[fieldCount];
        for (int i = 0; i < fieldCount; i++) {
          position += tvd.readVLong();
          tvfPointers[i] = position;
        }

        result = readTermVectors(fields, tvfPointers);
      }
    } else {
      //System.out.println("No tvx file");
    }
    return result;
  
  /**
   * Reads a single term vector from the .tvf file.
   *
   * @param field the field to read in
   * @param tvfPointer the pointer within the tvf file where we should start reading
   * @return the TermVector located at that position
   * @throws IOException if the .tvf file cannot be read
   */
private org.apache.lucene.index.SegmentTermVectorreadTermVector(java.lang.String field, long tvfPointer)


    // Now read the data from specified position
    //We don't need to offset by the FORMAT here since the pointer already includes the offset
    tvf.seek(tvfPointer);

    int numTerms = tvf.readVInt();
    //System.out.println("Num Terms: " + numTerms);
    // If no terms - return a constant empty termvector. However, this should never occur!
    if (numTerms == 0) 
      return new SegmentTermVector(field, null, null);
    
    boolean storePositions;
    boolean storeOffsets;
    
    if(tvfFormat == TermVectorsWriter.FORMAT_VERSION){
      // Current format: a bit-field byte says which extras are stored.
      byte bits = tvf.readByte();
      storePositions = (bits & TermVectorsWriter.STORE_POSITIONS_WITH_TERMVECTOR) != 0;
      storeOffsets = (bits & TermVectorsWriter.STORE_OFFSET_WITH_TERMVECTOR) != 0;
    }
    else{
      // Older format: skip the (unused) VInt; positions/offsets not stored.
      tvf.readVInt();
      storePositions = false;
      storeOffsets = false;
    }

    String terms[] = new String[numTerms];
    int termFreqs[] = new int[numTerms];
    
    //  we may not need these, but declare them
    int positions[][] = null;
    TermVectorOffsetInfo offsets[][] = null;
    if(storePositions)
      positions = new int[numTerms][];
    if(storeOffsets)
      offsets = new TermVectorOffsetInfo[numTerms][];
    
    // Terms are stored with prefix compression: each entry records how many
    // leading chars it shares with the PREVIOUS term (start) and the number
    // of new chars (deltaLength). `buffer` holds the previous term's chars
    // between iterations (previousBuffer aliases it), so the shared prefix
    // is normally already in place and only the suffix must be read.
    int start = 0;
    int deltaLength = 0;
    int totalLength = 0;
    char [] buffer = new char[10];    // init the buffer with a length of 10 character
    char[] previousBuffer = {};
    
    for (int i = 0; i < numTerms; i++) {
      start = tvf.readVInt();        // length of prefix shared with previous term
      deltaLength = tvf.readVInt();  // number of chars to read after the prefix
      totalLength = start + deltaLength;
      if (buffer.length < totalLength) {  // increase buffer
        buffer = null;    // give a hint to garbage collector
        buffer = new char[totalLength];
        
        // The new buffer doesn't contain the previous term yet, so the
        // shared prefix must be copied over from the old one.
        if (start > 0)  // just copy if necessary
          System.arraycopy(previousBuffer, 0, buffer, 0, start);
      }
      
      tvf.readChars(buffer, start, deltaLength);
      terms[i] = new String(buffer, 0, totalLength);
      previousBuffer = buffer;
      int freq = tvf.readVInt();
      termFreqs[i] = freq;
      
      if (storePositions) { //read in the positions
        // Positions are delta-encoded relative to the previous position.
        int [] pos = new int[freq];
        positions[i] = pos;
        int prevPosition = 0;
        for (int j = 0; j < freq; j++)
        {
          pos[j] = prevPosition + tvf.readVInt();
          prevPosition = pos[j];
        }
      }
      
      if (storeOffsets) {
        // Start offsets are delta-encoded from the previous END offset;
        // end offsets are delta-encoded from their own start offset.
        TermVectorOffsetInfo[] offs = new TermVectorOffsetInfo[freq];
        offsets[i] = offs;
        int prevOffset = 0;
        for (int j = 0; j < freq; j++) {
          int startOffset = prevOffset + tvf.readVInt();
          int endOffset = startOffset + tvf.readVInt();
          offs[j] = new TermVectorOffsetInfo(startOffset, endOffset);
          prevOffset = endOffset;
        }
      }
    }
    
    // Use the positions-capable subclass only when there is extra data.
    SegmentTermVector tv;
    if (storePositions || storeOffsets){
      tv = new SegmentTermPositionVector(field, terms, termFreqs, positions, offsets);
    }
    else {
      tv = new SegmentTermVector(field, terms, termFreqs);
    }
    return tv;
  
private org.apache.lucene.index.SegmentTermVector[]readTermVectors(java.lang.String[] fields, long[] tvfPointers)

    SegmentTermVector res[] = new SegmentTermVector[fields.length];
    for (int i = 0; i < fields.length; i++) {
      res[i] = readTermVector(fields[i], tvfPointers[i]);
    }
    return res;
  
  /**
   * Returns the number of documents in this reader's segment, derived from
   * the length of the .tvx file in the constructor (0 when the segment has
   * no term vectors).
   *
   * @return the number of documents in the reader
   */
intsize()

    return size;