FieldsReader.java

package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.*;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.AlreadyClosedException;
import org.apache.lucene.store.BufferedIndexInput;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.Reader;
import java.util.zip.DataFormatException;
import java.util.zip.Inflater;

/**
 * Class responsible for access to stored document fields.
 * <p/>
 * It uses &lt;segment&gt;.fdt and &lt;segment&gt;.fdx files.
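 * <p/>
 * A rough sketch of how this class is typically used from within the package
 * (the segment name, the FieldInfos construction and the loop are illustrative only):
 * <pre>
 *   Directory dir = ...;                        // an open index directory
 *   FieldInfos infos = new FieldInfos(dir, "_0.fnm");
 *   FieldsReader reader = new FieldsReader(dir, "_0", infos);
 *   try {
 *     for (int i = 0; i &lt; reader.size(); i++) {
 *       Document doc = reader.doc(i, null);     // a null selector loads every field
 *     }
 *   } finally {
 *     reader.close();
 *   }
 * </pre>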
 *
 * @version $Id: FieldsReader.java 542561 2007-05-29 15:14:07Z mikemccand $
 */
final class FieldsReader {
  private final FieldInfos fieldInfos;

  // The main fieldStream, used only for cloning.
  private final IndexInput cloneableFieldsStream;

  // This is a clone of cloneableFieldsStream used for reading documents.
  // It should not be cloned outside of a synchronized context.
  private final IndexInput fieldsStream;

  private final IndexInput indexStream;
  private int size;
  private boolean closed;

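  // Per-thread clone of cloneableFieldsStream, created on demand by LazyField
  // (see getFieldStream below) so that lazily loaded fields do not disturb fieldsStream.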
  private ThreadLocal fieldsStreamTL = new ThreadLocal();

  FieldsReader(Directory d, String segment, FieldInfos fn) throws IOException {
    this(d, segment, fn, BufferedIndexInput.BUFFER_SIZE);
  }

  FieldsReader(Directory d, String segment, FieldInfos fn, int readBufferSize) throws IOException {
    fieldInfos = fn;

    cloneableFieldsStream = d.openInput(segment + ".fdt", readBufferSize);
    fieldsStream = (IndexInput)cloneableFieldsStream.clone();
    indexStream = d.openInput(segment + ".fdx", readBufferSize);
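    // The index file (.fdx) holds one 8-byte (long) pointer into the .fdt file per
    // document, so the number of documents is simply the index length divided by 8.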
    size = (int) (indexStream.length() / 8);
  }

  /**
   * @throws AlreadyClosedException if this FieldsReader is closed
   */
  protected final void ensureOpen() throws AlreadyClosedException {
    if (closed) {
      throw new AlreadyClosedException("this FieldsReader is closed");
    }
  }

  /**
   * Closes the underlying {@link org.apache.lucene.store.IndexInput} streams, including any ones associated with a
   * lazy implementation of a Field.  This means that the Field values will no longer be accessible.
   *
   * @throws IOException
   */
  final void close() throws IOException {
    if (!closed) {
      fieldsStream.close();
      cloneableFieldsStream.close();
      indexStream.close();
      IndexInput localFieldsStream = (IndexInput) fieldsStreamTL.get();
      if (localFieldsStream != null) {
        localFieldsStream.close();
        fieldsStreamTL.set(null);
      }
      closed = true;
    }
  }

  final int size() {
    return size;
  }

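  /**
   * Returns the stored fields of the <code>n</code>th document in this segment.
   * Each field is loaded, lazily loaded, sized or skipped according to the
   * {@link FieldSelectorResult} the given {@link FieldSelector} returns for it;
   * a null selector loads every field eagerly.
   */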
  final Document doc(int n, FieldSelector fieldSelector) throws CorruptIndexException, IOException {
    indexStream.seek(n * 8L);
    long position = indexStream.readLong();
    fieldsStream.seek(position);

    Document doc = new Document();
    int numFields = fieldsStream.readVInt();
    for (int i = 0; i < numFields; i++) {
      int fieldNumber = fieldsStream.readVInt();
      FieldInfo fi = fieldInfos.fieldInfo(fieldNumber);
      FieldSelectorResult acceptField = fieldSelector == null ? FieldSelectorResult.LOAD : fieldSelector.accept(fi.name);
      
      byte bits = fieldsStream.readByte();
      boolean compressed = (bits & FieldsWriter.FIELD_IS_COMPRESSED) != 0;
      boolean tokenize = (bits & FieldsWriter.FIELD_IS_TOKENIZED) != 0;
      boolean binary = (bits & FieldsWriter.FIELD_IS_BINARY) != 0;
      //TODO: Find an alternative approach here if this list continues to grow beyond the
      //5 or 6 cases currently handled here.  See LUCENE-762 for discussion
      if (acceptField.equals(FieldSelectorResult.LOAD)) {
        addField(doc, fi, binary, compressed, tokenize);
      }
      else if (acceptField.equals(FieldSelectorResult.LOAD_FOR_MERGE)) {
        addFieldForMerge(doc, fi, binary, compressed, tokenize);
      }
      else if (acceptField.equals(FieldSelectorResult.LOAD_AND_BREAK)){
        addField(doc, fi, binary, compressed, tokenize);
        break;//Get out of this loop
      }
      else if (acceptField.equals(FieldSelectorResult.LAZY_LOAD)) {
        addFieldLazy(doc, fi, binary, compressed, tokenize);
      }
      else if (acceptField.equals(FieldSelectorResult.SIZE)){
        skipField(binary, compressed, addFieldSize(doc, fi, binary, compressed));
      }
      else if (acceptField.equals(FieldSelectorResult.SIZE_AND_BREAK)){
        addFieldSize(doc, fi, binary, compressed);
        break;
      }
      else {
        skipField(binary, compressed);
      }
    }

    return doc;
  }

  /**
   * Skip the field.  We still have to read some of the information about the field, but can skip past the actual content.
   * This will have the most payoff on large fields.
   */
  private void skipField(boolean binary, boolean compressed) throws IOException {
    skipField(binary, compressed, fieldsStream.readVInt());
  }
  
  private void skipField(boolean binary, boolean compressed, int toRead) throws IOException {
    if (binary || compressed) {
      long pointer = fieldsStream.getFilePointer();
      fieldsStream.seek(pointer + toRead);
    } else {
      //We need to skip chars.  This will slow us down, but is still better than reading the full string
      fieldsStream.skipChars(toRead);
    }
  }

  private void addFieldLazy(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) throws IOException {
    if (binary) {
      int toRead = fieldsStream.readVInt();
      long pointer = fieldsStream.getFilePointer();
      if (compressed) {
        //was: doc.add(new Fieldable(fi.name, uncompress(b), Fieldable.Store.COMPRESS));
        doc.add(new LazyField(fi.name, Field.Store.COMPRESS, toRead, pointer));
      } else {
        //was: doc.add(new Fieldable(fi.name, b, Fieldable.Store.YES));
        doc.add(new LazyField(fi.name, Field.Store.YES, toRead, pointer));
      }
      //Need to move the pointer ahead by toRead positions
      fieldsStream.seek(pointer + toRead);
    } else {
      Field.Store store = Field.Store.YES;
      Field.Index index = getIndexType(fi, tokenize);
      Field.TermVector termVector = getTermVectorType(fi);

      Fieldable f;
      if (compressed) {
        store = Field.Store.COMPRESS;
        int toRead = fieldsStream.readVInt();
        long pointer = fieldsStream.getFilePointer();
        f = new LazyField(fi.name, store, toRead, pointer);
        //skip over the part that we aren't loading
        fieldsStream.seek(pointer + toRead);
        f.setOmitNorms(fi.omitNorms);
      } else {
        int length = fieldsStream.readVInt();
        long pointer = fieldsStream.getFilePointer();
        //Skip ahead of where we are by the length of what is stored
        fieldsStream.skipChars(length);
        f = new LazyField(fi.name, store, index, termVector, length, pointer);
        f.setOmitNorms(fi.omitNorms);
      }
      doc.add(f);
    }

  }

  // in merge mode we don't uncompress the data of a compressed field
  private void addFieldForMerge(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) throws IOException {
    Object data;
      
    if (binary || compressed) {
      int toRead = fieldsStream.readVInt();
      final byte[] b = new byte[toRead];
      fieldsStream.readBytes(b, 0, b.length);
      data = b;
    } else {
      data = fieldsStream.readString();
    }
      
    doc.add(new FieldForMerge(data, fi, binary, compressed, tokenize));
  }
  
  private void addField(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) throws CorruptIndexException, IOException {

    //we have a binary stored field, and it may be compressed
    if (binary) {
      int toRead = fieldsStream.readVInt();
      final byte[] b = new byte[toRead];
      fieldsStream.readBytes(b, 0, b.length);
      if (compressed)
        doc.add(new Field(fi.name, uncompress(b), Field.Store.COMPRESS));
      else
        doc.add(new Field(fi.name, b, Field.Store.YES));

    } else {
      Field.Store store = Field.Store.YES;
      Field.Index index = getIndexType(fi, tokenize);
      Field.TermVector termVector = getTermVectorType(fi);

      Fieldable f;
      if (compressed) {
        store = Field.Store.COMPRESS;
        int toRead = fieldsStream.readVInt();

        final byte[] b = new byte[toRead];
        fieldsStream.readBytes(b, 0, b.length);
        f = new Field(fi.name,      // field name
                new String(uncompress(b), "UTF-8"), // uncompress the value and add as string
                store,
                index,
                termVector);
        f.setOmitNorms(fi.omitNorms);
      } else {
        f = new Field(fi.name,     // name
                fieldsStream.readString(), // read value
                store,
                index,
                termVector);
        f.setOmitNorms(fi.omitNorms);
      }
      doc.add(f);
    }
  }
  
  // Add the size of the field as a byte[] containing the 4 bytes of the integer byte size (high order byte first; a char counts as 2 bytes)
  // Read just the size -- caller must skip the field content to continue reading fields
  // Return the size in bytes or chars, depending on field type
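  // For illustration, a reader of such a field could turn the stored bytes back into
  // an int like this (the field name used here is hypothetical):
  //   byte[] b = doc.getField("body").binaryValue();
  //   int byteSize = ((b[0] & 0xff) << 24) | ((b[1] & 0xff) << 16)
  //                | ((b[2] & 0xff) << 8)  |  (b[3] & 0xff);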
  private int addFieldSize(Document doc, FieldInfo fi, boolean binary, boolean compressed) throws IOException {
    int size = fieldsStream.readVInt(), bytesize = binary || compressed ? size : 2*size;
    byte[] sizebytes = new byte[4];
    sizebytes[0] = (byte) (bytesize>>>24);
    sizebytes[1] = (byte) (bytesize>>>16);
    sizebytes[2] = (byte) (bytesize>>> 8);
    sizebytes[3] = (byte)  bytesize      ;
    doc.add(new Field(fi.name, sizebytes, Field.Store.YES));
    return size;
  }

  private Field.TermVector getTermVectorType(FieldInfo fi) {
    Field.TermVector termVector = null;
    if (fi.storeTermVector) {
      if (fi.storeOffsetWithTermVector) {
        if (fi.storePositionWithTermVector) {
          termVector = Field.TermVector.WITH_POSITIONS_OFFSETS;
        } else {
          termVector = Field.TermVector.WITH_OFFSETS;
        }
      } else if (fi.storePositionWithTermVector) {
        termVector = Field.TermVector.WITH_POSITIONS;
      } else {
        termVector = Field.TermVector.YES;
      }
    } else {
      termVector = Field.TermVector.NO;
    }
    return termVector;
  }

  private Field.Index getIndexType(FieldInfo fi, boolean tokenize) {
    Field.Index index;
    if (fi.isIndexed && tokenize)
      index = Field.Index.TOKENIZED;
    else if (fi.isIndexed && !tokenize)
      index = Field.Index.UN_TOKENIZED;
    else
      index = Field.Index.NO;
    return index;
  }

  /**
   * A lazy implementation of Fieldable that defers loading of fields until they are asked for, instead of when the
   * Document is loaded.
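   * <p/>
   * For example, with a {@link FieldSelector} that answers
   * {@link FieldSelectorResult#LAZY_LOAD} for a field, nothing is read from the .fdt
   * file until the value is actually requested (the names below are illustrative):
   * <pre>
   *   Document doc = fieldsReader.doc(n, lazySelector);
   *   Fieldable f = doc.getFieldable("body");
   *   String value = f.stringValue();   // the bytes are read from disk only here
   * </pre>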
   */
  private class LazyField extends AbstractField implements Fieldable {
    private int toRead;
    private long pointer;

    public LazyField(String name, Field.Store store, int toRead, long pointer) {
      super(name, store, Field.Index.NO, Field.TermVector.NO);
      this.toRead = toRead;
      this.pointer = pointer;
      lazy = true;
    }

    public LazyField(String name, Field.Store store, Field.Index index, Field.TermVector termVector, int toRead, long pointer) {
      super(name, store, index, termVector);
      this.toRead = toRead;
      this.pointer = pointer;
      lazy = true;
    }

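    // Each thread gets (and caches in fieldsStreamTL) its own clone of
    // cloneableFieldsStream, since an IndexInput keeps its own file pointer and is
    // not safe to share across threads without external synchronization.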
    private IndexInput getFieldStream() {
      IndexInput localFieldsStream = (IndexInput) fieldsStreamTL.get();
      if (localFieldsStream == null) {
        localFieldsStream = (IndexInput) cloneableFieldsStream.clone();
        fieldsStreamTL.set(localFieldsStream);
      }
      return localFieldsStream;
    }

    /** The value of the field in Binary, or null.  If null, the Reader value,
     * String value, or TokenStream value is used. Exactly one of stringValue(), 
     * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
    public byte[] binaryValue() {
      ensureOpen();
      if (fieldsData == null) {
        final byte[] b = new byte[toRead];
        IndexInput localFieldsStream = getFieldStream();
        //Wrap the IOException in a FieldReaderException; IndexReader.document throws IOException anyway, so this is
        //probably not that big of a change for people, since they are already handling that exception when getting the document
        try {
          localFieldsStream.seek(pointer);
          localFieldsStream.readBytes(b, 0, b.length);
          if (isCompressed) {
            fieldsData = uncompress(b);
          } else {
            fieldsData = b;
          }
        } catch (IOException e) {
          throw new FieldReaderException(e);
        }
      }
      return fieldsData instanceof byte[] ? (byte[]) fieldsData : null;
    }

    /** The value of the field as a Reader, or null.  If null, the String value,
     * binary value, or TokenStream value is used.  Exactly one of stringValue(), 
     * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
    public Reader readerValue() {
      ensureOpen();
      return fieldsData instanceof Reader ? (Reader) fieldsData : null;
    }

    /** The value of the field as a TokenStream, or null.  If null, the Reader value,
     * String value, or binary value is used. Exactly one of stringValue(), 
     * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
    public TokenStream tokenStreamValue() {
      ensureOpen();
      return fieldsData instanceof TokenStream ? (TokenStream) fieldsData : null;
    }

    
    /** The value of the field as a String, or null.  If null, the Reader value,
     * binary value, or TokenStream value is used.  Exactly one of stringValue(), 
     * readerValue(), binaryValue(), and tokenStreamValue() must be set. */
    public String stringValue() {
      ensureOpen();
      if (fieldsData == null) {
        IndexInput localFieldsStream = getFieldStream();
        try {
          localFieldsStream.seek(pointer);
          if (isCompressed) {
            final byte[] b = new byte[toRead];
            localFieldsStream.readBytes(b, 0, b.length);
            fieldsData = new String(uncompress(b), "UTF-8");
          } else {
            //read in chars b/c we already know the length we need to read
            char[] chars = new char[toRead];
            localFieldsStream.readChars(chars, 0, toRead);
            fieldsData = new String(chars);
          }
        } catch (IOException e) {
          throw new FieldReaderException(e);
        }
      }
      return fieldsData instanceof String ? (String) fieldsData : null;
    }

    public long getPointer() {
      ensureOpen();
      return pointer;
    }

    public void setPointer(long pointer) {
      ensureOpen();
      this.pointer = pointer;
    }

    public int getToRead() {
      ensureOpen();
      return toRead;
    }

    public void setToRead(int toRead) {
      ensureOpen();
      this.toRead = toRead;
    }
  }

  private final byte[] uncompress(final byte[] input)
          throws CorruptIndexException, IOException {

    Inflater decompressor = new Inflater();
    decompressor.setInput(input);

    // Create an expandable byte array to hold the decompressed data
    ByteArrayOutputStream bos = new ByteArrayOutputStream(input.length);

    // Decompress the data
    byte[] buf = new byte[1024];
    while (!decompressor.finished()) {
      try {
        int count = decompressor.inflate(buf);
        bos.write(buf, 0, count);
      }
      catch (DataFormatException e) {
        // this will happen if the field is not compressed
        CorruptIndexException newException = new CorruptIndexException("field data is in the wrong format: " + e.toString());
        newException.initCause(e);
        throw newException;
      }
    }
  
    decompressor.end();
    
    // Get the decompressed data
    return bos.toByteArray();
  }
  
  // Instances of this class hold field properties and data
  // for merge
  final static class FieldForMerge extends AbstractField {
    public String stringValue() {
      return (String) this.fieldsData;
    }

    public Reader readerValue() {
      // not needed for merge
      return null;
    }

    public byte[] binaryValue() {
      return (byte[]) this.fieldsData;
    }

    public TokenStream tokenStreamValue() {
      // not needed for merge
      return null;
    }
    
    public FieldForMerge(Object value, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) {
      this.isStored = true;  
      this.fieldsData = value;
      this.isCompressed = compressed;
      this.isBinary = binary;
      this.isTokenized = tokenize;

      this.name = fi.name.intern();
      this.isIndexed = fi.isIndexed;
      this.omitNorms = fi.omitNorms;          
      this.storeOffsetWithTermVector = fi.storeOffsetWithTermVector;
      this.storePositionWithTermVector = fi.storePositionWithTermVector;
      this.storeTermVector = fi.storeTermVector;            
    }
     
  }
}