package org.apache.lucene.index;

/**
 * Copyright 2004 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.util.Hashtable;
import java.util.Enumeration;
import java.util.Arrays;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.OutputStream;
import org.apache.lucene.search.Similarity;

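/**
 * Writes a single {@link Document} into a new segment: the field-name table
 * (.fnm), stored field values (via {@link FieldsWriter}), the inverted
 * postings (.frq and .prx), the term dictionary (via TermInfosWriter),
 * optional term vectors, and one norms file per indexed field.
 *
 * <p>A minimal usage sketch (this class is package-private and normally
 * driven by IndexWriter; the segment name "_1" below is illustrative):
 * <pre>
 *   DocumentWriter writer = new DocumentWriter(directory, analyzer,
 *                                              Similarity.getDefault(), 10000);
 *   writer.addDocument("_1", doc);
 * </pre>
 */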
final class DocumentWriter {
  private Analyzer analyzer;
  private Directory directory;
  private Similarity similarity;
  private FieldInfos fieldInfos;
  private int maxFieldLength;

  /**
   * Creates a writer that adds each document to the given directory as its
   * own single-document segment.
   *
   * @param directory The directory to write the document information to
   * @param analyzer The analyzer to use for the document
   * @param similarity The Similarity function
   * @param maxFieldLength The maximum number of tokens a field may have
   */
  DocumentWriter(Directory directory, Analyzer analyzer,
                 Similarity similarity, int maxFieldLength) {
    this.directory = directory;
    this.analyzer = analyzer;
    this.similarity = similarity;
    this.maxFieldLength = maxFieldLength;
  }

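  /**
   * Adds a single document as a brand-new segment: writes the field names,
   * the stored field values, the inverted postings (sorted by term), and the
   * per-field norms.
   *
   * @param segment the name of the segment to create
   * @param doc the document to index
   */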
  final void addDocument(String segment, Document doc)
          throws IOException {
    // write field names
    fieldInfos = new FieldInfos();
    fieldInfos.add(doc);
    fieldInfos.write(directory, segment + ".fnm");

    // write field values
    FieldsWriter fieldsWriter =
            new FieldsWriter(directory, segment, fieldInfos);
    try {
      fieldsWriter.addDocument(doc);
    } finally {
      fieldsWriter.close();
    }

    // invert doc into postingTable
    postingTable.clear();                         // reset per-document state
    fieldLengths = new int[fieldInfos.size()];    // tokens indexed per field
    fieldPositions = new int[fieldInfos.size()];  // next token position per field

    fieldBoosts = new float[fieldInfos.size()];   // cumulative boost per field
    Arrays.fill(fieldBoosts, doc.getBoost());

    invertDocument(doc);

    // sort postingTable into an array
    Posting[] postings = sortPostingTable();

    /*
    for (int i = 0; i < postings.length; i++) {
      Posting posting = postings[i];
      System.out.print(posting.term);
      System.out.print(" freq=" + posting.freq);
      System.out.print(" pos=");
      System.out.print(posting.positions[0]);
      for (int j = 1; j < posting.freq; j++)
	System.out.print("," + posting.positions[j]);
      System.out.println("");
    }
    */

    // write postings
    writePostings(postings, segment);

    // write norms of indexed fields
    writeNorms(doc, segment);

  }

  // Keys are Terms, values are Postings.
  // Used to buffer a document before it is written to the index.
  private final Hashtable postingTable = new Hashtable();
  private int[] fieldLengths;
  private int[] fieldPositions;
  private float[] fieldBoosts;

  // Tokenizes the fields of a document into Postings. Honors each token's
  // position increment (so analyzers can leave gaps or stack synonyms at one
  // position) and stops indexing a field once maxFieldLength tokens are added.
  private final void invertDocument(Document doc)
          throws IOException {
    Enumeration fields = doc.fields();
    while (fields.hasMoreElements()) {
      Field field = (Field) fields.nextElement();
      String fieldName = field.name();
      int fieldNumber = fieldInfos.fieldNumber(fieldName);

      int length = fieldLengths[fieldNumber];     // length of field
      int position = fieldPositions[fieldNumber]; // position in field

      if (field.isIndexed()) {
        if (!field.isTokenized()) {		  // un-tokenized field
          addPosition(fieldName, field.stringValue(), position++);
          length++;
        } else {
          Reader reader;			  // find or make Reader
          if (field.readerValue() != null)
            reader = field.readerValue();
          else if (field.stringValue() != null)
            reader = new StringReader(field.stringValue());
          else
            throw new IllegalArgumentException
                    ("field must have either String or Reader value");

          // Tokenize field and add to postingTable
          TokenStream stream = analyzer.tokenStream(fieldName, reader);
          try {
            for (Token t = stream.next(); t != null; t = stream.next()) {
              position += (t.getPositionIncrement() - 1);
              addPosition(fieldName, t.termText(), position++);
              if (++length > maxFieldLength) break;
            }
          } finally {
            stream.close();
          }
        }

        fieldLengths[fieldNumber] = length;	  // save field length
        fieldPositions[fieldNumber] = position;	  // save field position
        fieldBoosts[fieldNumber] *= field.getBoost();
      }
    }
  }

  private final Term termBuffer = new Term("", ""); // reused across lookups to avoid allocating a Term per token

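  // Records one occurrence of (field, text) at the given token position.
  // Reuses termBuffer for the table lookup so no Term is allocated for terms
  // already seen; a full positions array is grown by doubling.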
  private final void addPosition(String field, String text, int position) {
    termBuffer.set(field, text);
    Posting ti = (Posting) postingTable.get(termBuffer);
    if (ti != null) {				  // word seen before
      int freq = ti.freq;
      if (ti.positions.length == freq) {          // positions array is full
        int[] newPositions = new int[freq * 2];   // double its size
        System.arraycopy(ti.positions, 0, newPositions, 0, freq);
        ti.positions = newPositions;
      }
      ti.positions[freq] = position;		  // add new position
      ti.freq = freq + 1;			  // update frequency
    } else {					  // word not seen before
      Term term = new Term(field, text, false);
      postingTable.put(term, new Posting(term, position));
    }
  }

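  // Copies the buffered postings into an array and sorts it by term, so the
  // postings can be written in the dictionary order TermInfosWriter expects.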
  private final Posting[] sortPostingTable() {
    // copy postingTable into an array
    Posting[] array = new Posting[postingTable.size()];
    Enumeration postings = postingTable.elements();
    for (int i = 0; postings.hasMoreElements(); i++)
      array[i] = (Posting) postings.nextElement();

    // sort the array
    quickSort(array, 0, array.length - 1);

    return array;
  }

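  // In-place quicksort of postings by term, using median-of-three pivot
  // selection to avoid quadratic behavior on already-sorted input.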
  private static final void quickSort(Posting[] postings, int lo, int hi) {
    if (lo >= hi)
      return;

    int mid = (lo + hi) >>> 1;                    // unsigned shift avoids overflow of lo + hi

    if (postings[lo].term.compareTo(postings[mid].term) > 0) {
      Posting tmp = postings[lo];
      postings[lo] = postings[mid];
      postings[mid] = tmp;
    }

    if (postings[mid].term.compareTo(postings[hi].term) > 0) {
      Posting tmp = postings[mid];
      postings[mid] = postings[hi];
      postings[hi] = tmp;

      if (postings[lo].term.compareTo(postings[mid].term) > 0) {
        Posting tmp2 = postings[lo];
        postings[lo] = postings[mid];
        postings[mid] = tmp2;
      }
    }

    int left = lo + 1;
    int right = hi - 1;

    if (left >= right)
      return;

    Term partition = postings[mid].term;

    for (;;) {
      while (postings[right].term.compareTo(partition) > 0)
        --right;

      while (left < right && postings[left].term.compareTo(partition) <= 0)
        ++left;

      if (left < right) {
        Posting tmp = postings[left];
        postings[left] = postings[right];
        postings[right] = tmp;
        --right;
      } else {
        break;
      }
    }

    quickSort(postings, lo, left);
    quickSort(postings, left + 1, hi);
  }

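  /**
   * Writes the sorted postings to the segment's inverted files. Since the
   * segment holds exactly one document (number 0), each .frq entry is either
   * the single VInt 1 (doc 0 with the low bit set, meaning freq == 1) or a 0
   * followed by the frequency. Positions go to .prx delta-encoded, and term
   * vectors are written for any field marked to store them.
   */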
  private final void writePostings(Posting[] postings, String segment)
          throws IOException {
    OutputStream freq = null, prox = null;
    TermInfosWriter tis = null;
    TermVectorsWriter termVectorWriter = null;
    try {
      //open files for inverse index storage
      freq = directory.createFile(segment + ".frq");
      prox = directory.createFile(segment + ".prx");
      tis = new TermInfosWriter(directory, segment, fieldInfos);
      TermInfo ti = new TermInfo();
      String currentField = null;

      for (int i = 0; i < postings.length; i++) {
        Posting posting = postings[i];

        // add an entry to the dictionary with pointers to prox and freq files
        ti.set(1, freq.getFilePointer(), prox.getFilePointer(), -1);
        tis.add(posting.term, ti);

        // add an entry to the freq file
        int postingFreq = posting.freq;
        if (postingFreq == 1)				  // optimize freq=1
          freq.writeVInt(1);			  // set low bit of doc num.
        else {
          freq.writeVInt(0);			  // the document number
          freq.writeVInt(postingFreq);			  // frequency in doc
        }

        int lastPosition = 0;			  // write positions
        int[] positions = posting.positions;
        for (int j = 0; j < postingFreq; j++) {		  // use delta-encoding
          int position = positions[j];
          prox.writeVInt(position - lastPosition);
          lastPosition = position;
        }
        // check to see if we switched to a new field; field names are
        // interned by Field's constructor, so identity comparison is safe
        String termField = posting.term.field();
        if (currentField != termField) {
          // changing field - see if there is something to save
          currentField = termField;
          FieldInfo fi = fieldInfos.fieldInfo(currentField);
          if (fi.storeTermVector) {
            if (termVectorWriter == null) {
              termVectorWriter =
                new TermVectorsWriter(directory, segment, fieldInfos);
              termVectorWriter.openDocument();
            }
            termVectorWriter.openField(currentField);
          } else if (termVectorWriter != null) {
            termVectorWriter.closeField();
          }
        }
        if (termVectorWriter != null && termVectorWriter.isFieldOpen()) {
          termVectorWriter.addTerm(posting.term.text(), postingFreq);
        }
      }
      if (termVectorWriter != null)
        termVectorWriter.closeDocument();
    } finally {
      // make an effort to close all streams we can but remember and re-throw
      // the first exception encountered in this process
      IOException keep = null;
      if (freq != null) try { freq.close(); } catch (IOException e) { if (keep == null) keep = e; }
      if (prox != null) try { prox.close(); } catch (IOException e) { if (keep == null) keep = e; }
      if (tis  != null) try {  tis.close(); } catch (IOException e) { if (keep == null) keep = e; }
      if (termVectorWriter  != null) try {  termVectorWriter.close(); } catch (IOException e) { if (keep == null) keep = e; }
      if (keep != null) throw (IOException) keep.fillInStackTrace();
    }
  }

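  // Writes one norm file per indexed field ("<segment>.f<fieldNumber>"),
  // each holding a single byte: the field's boost times the Similarity
  // length norm, quantized by Similarity.encodeNorm().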
  private final void writeNorms(Document doc, String segment) throws IOException {
    for (int n = 0; n < fieldInfos.size(); n++) {
      FieldInfo fi = fieldInfos.fieldInfo(n);
      if (fi.isIndexed) {
        float norm = fieldBoosts[n] * similarity.lengthNorm(fi.name, fieldLengths[n]);
        OutputStream norms = directory.createFile(segment + ".f" + n);
        try {
          norms.writeByte(Similarity.encodeNorm(norm));
        } finally {
          norms.close();
        }
      }
    }
  }
}

final class Posting {				  // info about a Term in a doc
  Term term;					  // the Term
  int freq;					  // its frequency in doc
  int[] positions;				  // positions it occurs at

  Posting(Term t, int position) {
    term = t;
    freq = 1;
    positions = new int[1];
    positions[0] = position;
  }
}