File: DocumentWriter.java
Doc: API Doc
Category: Apache Lucene 2.2.0
Size: 20319 bytes
Date: Sat Jun 16 22:20:36 BST 2007
Package: org.apache.lucene.index

DocumentWriter

public final class DocumentWriter extends Object
Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

Fields Summary
private Analyzer analyzer
private Directory directory
private Similarity similarity
private FieldInfos fieldInfos
private int maxFieldLength
private int termIndexInterval
private PrintStream infoStream
private final Hashtable postingTable
private int[] fieldLengths
private int[] fieldPositions
private int[] fieldOffsets
private float[] fieldBoosts
private BitSet fieldStoresPayloads
private List openTokenStreams
private final Term termBuffer    // reused across addPosition calls to avoid consing
Constructors Summary
DocumentWriter(Directory directory, Analyzer analyzer, Similarity similarity, int maxFieldLength)
This constructor is used by test code only.

param directory The directory to write the document information to
param analyzer The analyzer to use for the document
param similarity The Similarity function
param maxFieldLength The maximum number of tokens a field may have

    this.directory = directory;
    this.analyzer = analyzer;
    this.similarity = similarity;
    this.maxFieldLength = maxFieldLength;
  
DocumentWriter(Directory directory, Analyzer analyzer, IndexWriter writer)

    this.directory = directory;
    this.analyzer = analyzer;
    this.similarity = writer.getSimilarity();
    this.maxFieldLength = writer.getMaxFieldLength();
    this.termIndexInterval = writer.getTermIndexInterval();
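
For orientation, a minimal sketch of how this class is driven (DocumentWriter is normally invoked internally by IndexWriter; the segment name "_test", the analyzer, and the maxFieldLength of 10000 below are illustrative choices, and the example class may need to live in package org.apache.lucene.index if DocumentWriter is package-private in your build):

    import java.io.IOException;
    import org.apache.lucene.analysis.SimpleAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.search.Similarity;
    import org.apache.lucene.store.RAMDirectory;

    class DocumentWriterExample {
      public static void main(String[] args) throws IOException {
        RAMDirectory dir = new RAMDirectory();

        // Build a one-field document and write it as a single-document segment.
        Document doc = new Document();
        doc.add(new Field("body", "the quick brown fox",
                          Field.Store.YES, Field.Index.TOKENIZED));

        DocumentWriter writer = new DocumentWriter(dir, new SimpleAnalyzer(),
                                                   Similarity.getDefault(), 10000);
        writer.addDocument("_test", doc);   // writes the _test.* segment files
      }
    }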
  
Methods Summary
final void addDocument(java.lang.String segment, org.apache.lucene.document.Document doc)

    // create field infos
    fieldInfos = new FieldInfos();
    fieldInfos.add(doc);
    
    // invert doc into postingTable
    postingTable.clear();			  // clear postingTable
    fieldLengths = new int[fieldInfos.size()];    // init fieldLengths
    fieldPositions = new int[fieldInfos.size()];  // init fieldPositions
    fieldOffsets = new int[fieldInfos.size()];    // init fieldOffsets
    fieldStoresPayloads = new BitSet(fieldInfos.size());
    
    fieldBoosts = new float[fieldInfos.size()];	  // init fieldBoosts
    Arrays.fill(fieldBoosts, doc.getBoost());

    try {
    
      // Before we write the FieldInfos we invert the Document. The reason is that
      // during inversion the TokenStreams of tokenized fields are being processed
      // and we might encounter tokens that have payloads associated with them. In
      // this case we have to update the FieldInfo of the particular field.
      invertDocument(doc);
    
      // sort postingTable into an array
      Posting[] postings = sortPostingTable();
    
      // write field infos 
      fieldInfos.write(directory, segment + ".fnm");

      // write field values
      FieldsWriter fieldsWriter =
        new FieldsWriter(directory, segment, fieldInfos);
      try {
        fieldsWriter.addDocument(doc);
      } finally {
        fieldsWriter.close();
      }
    
      /*
      for (int i = 0; i < postings.length; i++) {
        Posting posting = postings[i];
        System.out.print(posting.term);
        System.out.print(" freq=" + posting.freq);
        System.out.print(" pos=");
        System.out.print(posting.positions[0]);
        for (int j = 1; j < posting.freq; j++)
	  System.out.print("," + posting.positions[j]);
        System.out.println("");
      }
       */

      // write postings
      writePostings(postings, segment);

      // write norms of indexed fields
      writeNorms(segment);
    } finally {
      // close TokenStreams
      IOException ex = null;
      
      Iterator it = openTokenStreams.iterator();
      while (it.hasNext()) {
        try {
          ((TokenStream) it.next()).close();
        } catch (IOException e) {
          // remember only the first exception; keep closing the remaining streams
          if (ex == null) {
            ex = e;
          }
        }
      }
      openTokenStreams.clear();
      
      if (ex != null) {
        throw ex;
      }
    }
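
Taken together, a single addDocument call materializes a complete one-document segment. A sketch of what one would expect to find in the Directory afterwards (file roles per the Lucene 2.2 file-format documentation; the segment name "_0" is illustrative):

    // after: new DocumentWriter(dir, analyzer, similarity, 10000).addDocument("_0", doc);
    String[] files = dir.list();
    // expected contents:
    //   _0.fnm           field infos                    (fieldInfos.write)
    //   _0.fdx, _0.fdt   stored fields index and data   (FieldsWriter)
    //   _0.frq, _0.prx   postings: freqs and positions  (writePostings)
    //   _0.tis, _0.tii   term dictionary and its index  (TermInfosWriter)
    //   _0.f0, _0.f1 ... one norms file per indexed field (writeNorms)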
  
private final void addPosition(java.lang.String field, java.lang.String text, int position, org.apache.lucene.index.Payload payload, org.apache.lucene.index.TermVectorOffsetInfo offset)

    termBuffer.set(field, text);
    //System.out.println("Offset: " + offset);
    Posting ti = (Posting) postingTable.get(termBuffer);
    if (ti != null) {				  // word seen before
      int freq = ti.freq;
      if (ti.positions.length == freq) {	  // positions array is full
        int[] newPositions = new int[freq * 2];	  // double size
        int[] positions = ti.positions;
        System.arraycopy(positions, 0, newPositions, 0, freq);
        ti.positions = newPositions;
        
        if (ti.payloads != null) {
          // the current field stores payloads
          Payload[] newPayloads = new Payload[freq * 2];  // grow payloads array
          Payload[] payloads = ti.payloads;
          System.arraycopy(payloads, 0, newPayloads, 0, payloads.length);
          ti.payloads = newPayloads;
        }
      }
      ti.positions[freq] = position;		  // add new position

      if (payload != null) {
        if (ti.payloads == null) {
          // lazily allocate payload array
          ti.payloads = new Payload[ti.positions.length];
        }
        ti.payloads[freq] = payload;
      }
      
      if (offset != null) {
        if (ti.offsets.length == freq){
          TermVectorOffsetInfo [] newOffsets = new TermVectorOffsetInfo[freq*2];
          TermVectorOffsetInfo [] offsets = ti.offsets;
          System.arraycopy(offsets, 0, newOffsets, 0, freq);
          ti.offsets = newOffsets;
        }
        ti.offsets[freq] = offset;
      }
      ti.freq = freq + 1;			  // update frequency
    } else {					  // word not seen before
      Term term = new Term(field, text, false);
      postingTable.put(term, new Posting(term, position, payload, offset));
    }
  
int getNumFields()

    return fieldInfos.size();
  
private final void invertDocument(org.apache.lucene.document.Document doc)
Tokenizes the fields of a document into Postings.

    Iterator fieldIterator = doc.getFields().iterator();
    while (fieldIterator.hasNext()) {
      Fieldable field = (Fieldable) fieldIterator.next();
      String fieldName = field.name();
      int fieldNumber = fieldInfos.fieldNumber(fieldName);

      int length = fieldLengths[fieldNumber];     // length of field
      int position = fieldPositions[fieldNumber]; // position in field
      if (length>0) position+=analyzer.getPositionIncrementGap(fieldName);
      int offset = fieldOffsets[fieldNumber];       // offset field

      if (field.isIndexed()) {
        if (!field.isTokenized()) {		  // un-tokenized field
          String stringValue = field.stringValue();
          if(field.isStoreOffsetWithTermVector())
            addPosition(fieldName, stringValue, position++, null, new TermVectorOffsetInfo(offset, offset + stringValue.length()));
          else
            addPosition(fieldName, stringValue, position++, null, null);
          offset += stringValue.length();
          length++;
        } else 
        { // tokenized field
          TokenStream stream = field.tokenStreamValue();
          
          // the field does not have a TokenStream,
          // so we have to obtain one from the analyzer
          if (stream == null) {
            Reader reader;			  // find or make Reader
            if (field.readerValue() != null)
              reader = field.readerValue();
            else if (field.stringValue() != null)
              reader = new StringReader(field.stringValue());
            else
              throw new IllegalArgumentException
                      ("field must have either String or Reader value");
  
            // Tokenize field and add to postingTable
            stream = analyzer.tokenStream(fieldName, reader);
          }
          
          // remember this TokenStream, we must close it later
          openTokenStreams.add(stream);
          
          // reset the TokenStream to the first token
          stream.reset();
          

          Token lastToken = null;
          for (Token t = stream.next(); t != null; t = stream.next()) {
            position += (t.getPositionIncrement() - 1);
              
            Payload payload = t.getPayload();
            if (payload != null) {
              // enable payloads for this field
              fieldStoresPayloads.set(fieldNumber);
            }
              
            TermVectorOffsetInfo termVectorOffsetInfo;
            if (field.isStoreOffsetWithTermVector()) {
              termVectorOffsetInfo = new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset());
            } else {
              termVectorOffsetInfo = null;
            }
            addPosition(fieldName, t.termText(), position++, payload, termVectorOffsetInfo);
              
            lastToken = t;
            if (++length >= maxFieldLength) {
              if (infoStream != null)
                infoStream.println("maxFieldLength " +maxFieldLength+ " reached, ignoring following tokens");
              break;
            }
          }
            
          if(lastToken != null)
            offset += lastToken.endOffset() + 1;
        }

        fieldLengths[fieldNumber] = length;	  // save field length
        fieldPositions[fieldNumber] = position;	  // save field position
        fieldBoosts[fieldNumber] *= field.getBoost();
        fieldOffsets[fieldNumber] = offset;
      }
    }
    
    // update fieldInfos for all fields that have one or more tokens with payloads
    for (int i = fieldStoresPayloads.nextSetBit(0); i >= 0; i = fieldStoresPayloads.nextSetBit(i+1)) {
      fieldInfos.fieldInfo(i).storePayloads = true;
    }
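
The positionIncrementGap handling above is what keeps phrase queries from matching across the boundary between two values of a multi-valued field. A minimal sketch, assuming the Lucene 2.2 Analyzer API (the gap of 10 is an arbitrary illustrative value):

    import java.io.Reader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.WhitespaceTokenizer;

    // Whitespace tokenization plus a gap of 10 positions between successive
    // values of the same field.
    Analyzer analyzer = new Analyzer() {
      public TokenStream tokenStream(String fieldName, Reader reader) {
        return new WhitespaceTokenizer(reader);
      }
      public int getPositionIncrementGap(String fieldName) {
        return 10;
      }
    };
    // With two values "a b" and "c d" in one field, invertDocument assigns
    // positions a=0, b=1, c=12, d=13 instead of c=2, d=3.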
  
private static final void quickSort(org.apache.lucene.index.Posting[] postings, int lo, int hi)

    if (lo >= hi)
      return;

    int mid = (lo + hi) / 2;

    if (postings[lo].term.compareTo(postings[mid].term) > 0) {
      Posting tmp = postings[lo];
      postings[lo] = postings[mid];
      postings[mid] = tmp;
    }

    if (postings[mid].term.compareTo(postings[hi].term) > 0) {
      Posting tmp = postings[mid];
      postings[mid] = postings[hi];
      postings[hi] = tmp;

      if (postings[lo].term.compareTo(postings[mid].term) > 0) {
        Posting tmp2 = postings[lo];
        postings[lo] = postings[mid];
        postings[mid] = tmp2;
      }
    }

    int left = lo + 1;
    int right = hi - 1;

    if (left >= right)
      return;

    Term partition = postings[mid].term;

    for (; ;) {
      while (postings[right].term.compareTo(partition) > 0)
        --right;

      while (left < right && postings[left].term.compareTo(partition) <= 0)
        ++left;

      if (left < right) {
        Posting tmp = postings[left];
        postings[left] = postings[right];
        postings[right] = tmp;
        --right;
      } else {
        break;
      }
    }

    quickSort(postings, lo, left);
    quickSort(postings, left + 1, hi);
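
The sort key is Term.compareTo, which compares field names first and term text second, so all postings for one field end up in a single contiguous run; writePostings below relies on this when it watches for field changes. A small illustration:

    // "author" < "body" lexically, so the field wins even though "zeta" > "alpha".
    boolean fieldFirst =
        new Term("author", "zeta").compareTo(new Term("body", "alpha")) < 0;  // true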
  
void setInfoStream(java.io.PrintStream infoStream)
If non-null, a message will be printed to this stream when maxFieldLength is reached.

    this.infoStream = infoStream;
  
private final org.apache.lucene.index.Posting[] sortPostingTable()

    // copy postingTable into an array
    Posting[] array = new Posting[postingTable.size()];
    Enumeration postings = postingTable.elements();
    for (int i = 0; postings.hasMoreElements(); i++)
      array[i] = (Posting) postings.nextElement();

    // sort the array
    quickSort(array, 0, array.length - 1);

    return array;
  
private final void writeNorms(java.lang.String segment)

    for(int n = 0; n < fieldInfos.size(); n++){
      FieldInfo fi = fieldInfos.fieldInfo(n);
      if(fi.isIndexed && !fi.omitNorms){
        float norm = fieldBoosts[n] * similarity.lengthNorm(fi.name, fieldLengths[n]);
        IndexOutput norms = directory.createOutput(segment + ".f" + n);
        try {
          norms.writeByte(Similarity.encodeNorm(norm));
        } finally {
          norms.close();
        }
      }
    }
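
Each norm is a boost-times-length-normalization float collapsed into a single byte before it is written, so decoding is lossy. A small sketch of the round trip (encodeNorm/decodeNorm are the Lucene 2.2 APIs; the field name and token count are made up):

    // A boost of 1.0 on a 100-token field, squeezed into one byte and back.
    float norm = 1.0f * Similarity.getDefault().lengthNorm("body", 100);
    byte encoded = Similarity.encodeNorm(norm);      // lossy 8-bit float
    float decoded = Similarity.decodeNorm(encoded);  // approximately norm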
  
private final void writePostings(org.apache.lucene.index.Posting[] postings, java.lang.String segment)

    IndexOutput freq = null, prox = null;
    TermInfosWriter tis = null;
    TermVectorsWriter termVectorWriter = null;
    try {
      //open files for inverse index storage
      freq = directory.createOutput(segment + ".frq");
      prox = directory.createOutput(segment + ".prx");
      tis = new TermInfosWriter(directory, segment, fieldInfos,
                                termIndexInterval);
      TermInfo ti = new TermInfo();
      String currentField = null;
      boolean currentFieldHasPayloads = false;
      
      for (int i = 0; i < postings.length; i++) {
        Posting posting = postings[i];

        // check to see if we switched to a new field; reference comparison is
        // safe here because field names are interned
        String termField = posting.term.field();
        if (currentField != termField) {
          // changing field - see if there is something to save
          currentField = termField;
          FieldInfo fi = fieldInfos.fieldInfo(currentField);
          currentFieldHasPayloads = fi.storePayloads;
          if (fi.storeTermVector) {
            if (termVectorWriter == null) {
              termVectorWriter =
                new TermVectorsWriter(directory, segment, fieldInfos);
              termVectorWriter.openDocument();
            }
            termVectorWriter.openField(currentField);

          } else if (termVectorWriter != null) {
            termVectorWriter.closeField();
          }
        }
        
        // add an entry to the dictionary with pointers to prox and freq files
        ti.set(1, freq.getFilePointer(), prox.getFilePointer(), -1);
        tis.add(posting.term, ti);

        // add an entry to the freq file
        int postingFreq = posting.freq;
        if (postingFreq == 1)				  // optimize freq=1
          freq.writeVInt(1);			  // set low bit of doc num.
        else {
          freq.writeVInt(0);			  // the document number
          freq.writeVInt(postingFreq);			  // frequency in doc
        }
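        // (a DocumentWriter segment holds exactly one document, doc number 0,
        //  so the doc delta is 0; the format folds freq==1 into the low bit of
        //  the delta, hence 0<<1|1 == 1 above, and 0<<1 == 0 followed by the
        //  explicit frequency otherwise)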

        int lastPosition = 0;			  // write positions
        int[] positions = posting.positions;
        Payload[] payloads = posting.payloads;
        int lastPayloadLength = -1;
        
        
        // The following encoding is being used for positions and payloads:
        // Case 1: current field does not store payloads
        //           Positions     -> <PositionDelta>^freq
        //           PositionDelta -> VInt
        //         The PositionDelta is the difference between the current
        //         and the previous position
        // Case 2: current field stores payloads
        //           Positions     -> <PositionDelta, Payload>^freq
        //           Payload       ->  <PayloadLength?, PayloadData>
        //           PositionDelta -> VInt
        //           PayloadLength -> VInt
        //           PayloadData   -> byte^PayloadLength
        //         In this case PositionDelta/2 is the difference between
        //         the current and the previous position. If PositionDelta
        //         is odd, then a PayloadLength encoded as VInt follows,
        //         if PositionDelta is even, then it is assumed that the
        //         length of the current Payload equals the length of the
        //         previous Payload.        
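        //
        // Worked example (illustrative values): positions 3, 7, 12 with
        // payload lengths 2, 2, 0 in a payload-storing field are written as
        //   pos 3  (delta 3, length changes from -1 to 2): VInt 7,  VInt 2, 2 payload bytes
        //   pos 7  (delta 4, length unchanged):            VInt 8,          2 payload bytes
        //   pos 12 (delta 5, length changes to 0):         VInt 11, VInt 0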
        for (int j = 0; j < postingFreq; j++) {		  // use delta-encoding
          int position = positions[j];
          int delta = position - lastPosition;
          if (currentFieldHasPayloads) {
            int payloadLength = 0;
            Payload payload = null;
            if (payloads != null) {
              payload = payloads[j];
              if (payload != null) {
                payloadLength = payload.length;
              }
            }
            if (payloadLength == lastPayloadLength) {
            	// the length of the current payload equals the length
            	// of the previous one. So we do not have to store the length
            	// again and we only shift the position delta by one bit
              prox.writeVInt(delta * 2);
            } else {
            	// the length of the current payload is different from the
            	// previous one. We shift the position delta, set the lowest
            	// bit and store the current payload length as VInt.
              prox.writeVInt(delta * 2 + 1);
              prox.writeVInt(payloadLength);
              lastPayloadLength = payloadLength;
            }
            if (payloadLength > 0) {
            	// write current payload
              prox.writeBytes(payload.data, payload.offset, payload.length);
            }
          } else {
          	// field does not store payloads, just write position delta as VInt
            prox.writeVInt(delta);
          }
          lastPosition = position;
        }
        if (termVectorWriter != null && termVectorWriter.isFieldOpen()) {
            termVectorWriter.addTerm(posting.term.text(), postingFreq, posting.positions, posting.offsets);
        }
      }
      if (termVectorWriter != null)
        termVectorWriter.closeDocument();
    } finally {
      // make an effort to close all streams we can but remember and re-throw
      // the first exception encountered in this process
      IOException keep = null;
      if (freq != null) try { freq.close(); } catch (IOException e) { if (keep == null) keep = e; }
      if (prox != null) try { prox.close(); } catch (IOException e) { if (keep == null) keep = e; }
      if (tis  != null) try {  tis.close(); } catch (IOException e) { if (keep == null) keep = e; }
      if (termVectorWriter  != null) try {  termVectorWriter.close(); } catch (IOException e) { if (keep == null) keep = e; }
      if (keep != null) throw (IOException) keep.fillInStackTrace();
    }