File Doc Category Size Date Package
DocumentWriter.java API Doc Apache Lucene 2.2.0 20319 Sat Jun 16 22:20:36 BST 2007 org.apache.lucene.index

DocumentWriter

java.lang.Object

public final class DocumentWriter extends Object

Licensed to the Apache Software Foundation (ASF) under one or more contributor license agreements. See the NOTICE file distributed with this work for additional information regarding copyright ownership. The ASF licenses this file to You under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

(Omit source code)

Fields Summary
private Analyzer
analyzer
private Directory
directory
private Similarity
similarity
private FieldInfos
fieldInfos
private int
maxFieldLength
private int
termIndexInterval
private PrintStream
infoStream
private final Hashtable
postingTable
private int[]
fieldLengths
private int[]
fieldPositions
private int[]
fieldOffsets
private float[]
fieldBoosts
private BitSet
fieldStoresPayloads
private List
openTokenStreams
private final Term
termBuffer
Constructors Summary
DocumentWriter(Directory directory, Analyzer analyzer, Similarity similarity, int maxFieldLength)
This ctor used by test code only.
param
directory The directory to write the document information to
param
analyzer The analyzer to use for the document
param
similarity The Similarity function
param
maxFieldLength The maximum number of tokens a field may have
this.directory = directory; this.analyzer = analyzer; this.similarity = similarity; this.maxFieldLength = maxFieldLength;
DocumentWriter(Directory directory, Analyzer analyzer, IndexWriter writer)
this.directory = directory; this.analyzer = analyzer; this.similarity = writer.getSimilarity(); this.maxFieldLength = writer.getMaxFieldLength(); this.termIndexInterval = writer.getTermIndexInterval();
Methods Summary
final void addDocument(java.lang.String segment, org.apache.lucene.document.Document doc)
// create field infos fieldInfos = new FieldInfos(); fieldInfos.add(doc); // invert doc into postingTable postingTable.clear(); // clear postingTable fieldLengths = new int[fieldInfos.size()]; // init fieldLengths fieldPositions = new int[fieldInfos.size()]; // init fieldPositions fieldOffsets = new int[fieldInfos.size()]; // init fieldOffsets fieldStoresPayloads = new BitSet(fieldInfos.size()); fieldBoosts = new float[fieldInfos.size()]; // init fieldBoosts Arrays.fill(fieldBoosts, doc.getBoost()); try { // Before we write the FieldInfos we invert the Document. The reason is that // during invertion the TokenStreams of tokenized fields are being processed // and we might encounter tokens that have payloads associated with them. In // this case we have to update the FieldInfo of the particular field. invertDocument(doc); // sort postingTable into an array Posting[] postings = sortPostingTable(); // write field infos fieldInfos.write(directory, segment + ".fnm"); // write field values FieldsWriter fieldsWriter = new FieldsWriter(directory, segment, fieldInfos); try { fieldsWriter.addDocument(doc); } finally { fieldsWriter.close(); } /* for (int i = 0; i < postings.length; i++) { Posting posting = postings[i]; System.out.print(posting.term); System.out.print(" freq=" + posting.freq); System.out.print(" pos="); System.out.print(posting.positions[0]); for (int j = 1; j < posting.freq; j++) System.out.print("," + posting.positions[j]); System.out.println(""); } */ // write postings writePostings(postings, segment); // write norms of indexed fields writeNorms(segment); } finally { // close TokenStreams IOException ex = null; Iterator it = openTokenStreams.iterator(); while (it.hasNext()) { try { ((TokenStream) it.next()).close(); } catch (IOException e) { if (ex != null) { ex = e; } } } openTokenStreams.clear(); if (ex != null) { throw ex; } }
private final void addPosition(java.lang.String field, java.lang.String text, int position, org.apache.lucene.index.Payload payload, org.apache.lucene.index.TermVectorOffsetInfo offset)
// avoid consing termBuffer.set(field, text); //System.out.println("Offset: " + offset); Posting ti = (Posting) postingTable.get(termBuffer); if (ti != null) { // word seen before int freq = ti.freq; if (ti.positions.length == freq) { // positions array is full int[] newPositions = new int[freq * 2]; // double size int[] positions = ti.positions; System.arraycopy(positions, 0, newPositions, 0, freq); ti.positions = newPositions; if (ti.payloads != null) { // the current field stores payloads Payload[] newPayloads = new Payload[freq * 2]; // grow payloads array Payload[] payloads = ti.payloads; System.arraycopy(payloads, 0, newPayloads, 0, payloads.length); ti.payloads = newPayloads; } } ti.positions[freq] = position; // add new position if (payload != null) { if (ti.payloads == null) { // lazily allocate payload array ti.payloads = new Payload[ti.positions.length]; } ti.payloads[freq] = payload; } if (offset != null) { if (ti.offsets.length == freq){ TermVectorOffsetInfo [] newOffsets = new TermVectorOffsetInfo[freq*2]; TermVectorOffsetInfo [] offsets = ti.offsets; System.arraycopy(offsets, 0, newOffsets, 0, freq); ti.offsets = newOffsets; } ti.offsets[freq] = offset; } ti.freq = freq + 1; // update frequency } else { // word not seen before Term term = new Term(field, text, false); postingTable.put(term, new Posting(term, position, payload, offset)); }
int getNumFields()
return fieldInfos.size();
private final void invertDocument(org.apache.lucene.document.Document doc)
// Tokenizes the fields of a document into Postings. Iterator fieldIterator = doc.getFields().iterator(); while (fieldIterator.hasNext()) { Fieldable field = (Fieldable) fieldIterator.next(); String fieldName = field.name(); int fieldNumber = fieldInfos.fieldNumber(fieldName); int length = fieldLengths[fieldNumber]; // length of field int position = fieldPositions[fieldNumber]; // position in field if (length>0) position+=analyzer.getPositionIncrementGap(fieldName); int offset = fieldOffsets[fieldNumber]; // offset field if (field.isIndexed()) { if (!field.isTokenized()) { // un-tokenized field String stringValue = field.stringValue(); if(field.isStoreOffsetWithTermVector()) addPosition(fieldName, stringValue, position++, null, new TermVectorOffsetInfo(offset, offset + stringValue.length())); else addPosition(fieldName, stringValue, position++, null, null); offset += stringValue.length(); length++; } else { // tokenized field TokenStream stream = field.tokenStreamValue(); // the field does not have a TokenStream, // so we have to obtain one from the analyzer if (stream == null) { Reader reader; // find or make Reader if (field.readerValue() != null) reader = field.readerValue(); else if (field.stringValue() != null) reader = new StringReader(field.stringValue()); else throw new IllegalArgumentException ("field must have either String or Reader value"); // Tokenize field and add to postingTable stream = analyzer.tokenStream(fieldName, reader); } // remember this TokenStream, we must close it later openTokenStreams.add(stream); // reset the TokenStream to the first token stream.reset(); Token lastToken = null; for (Token t = stream.next(); t != null; t = stream.next()) { position += (t.getPositionIncrement() - 1); Payload payload = t.getPayload(); if (payload != null) { // enable payloads for this field fieldStoresPayloads.set(fieldNumber); } TermVectorOffsetInfo termVectorOffsetInfo; if (field.isStoreOffsetWithTermVector()) { termVectorOffsetInfo = new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset()); } else { termVectorOffsetInfo = null; } addPosition(fieldName, t.termText(), position++, payload, termVectorOffsetInfo); lastToken = t; if (++length >= maxFieldLength) { if (infoStream != null) infoStream.println("maxFieldLength " +maxFieldLength+ " reached, ignoring following tokens"); break; } } if(lastToken != null) offset += lastToken.endOffset() + 1; } fieldLengths[fieldNumber] = length; // save field length fieldPositions[fieldNumber] = position; // save field position fieldBoosts[fieldNumber] *= field.getBoost(); fieldOffsets[fieldNumber] = offset; } } // update fieldInfos for all fields that have one or more tokens with payloads for (int i = fieldStoresPayloads.nextSetBit(0); i >= 0; i = fieldStoresPayloads.nextSetBit(i+1)) { fieldInfos.fieldInfo(i).storePayloads = true; }
private static final void quickSort(org.apache.lucene.index.Posting[] postings, int lo, int hi)
if (lo >= hi) return; int mid = (lo + hi) / 2; if (postings[lo].term.compareTo(postings[mid].term) > 0) { Posting tmp = postings[lo]; postings[lo] = postings[mid]; postings[mid] = tmp; } if (postings[mid].term.compareTo(postings[hi].term) > 0) { Posting tmp = postings[mid]; postings[mid] = postings[hi]; postings[hi] = tmp; if (postings[lo].term.compareTo(postings[mid].term) > 0) { Posting tmp2 = postings[lo]; postings[lo] = postings[mid]; postings[mid] = tmp2; } } int left = lo + 1; int right = hi - 1; if (left >= right) return; Term partition = postings[mid].term; for (; ;) { while (postings[right].term.compareTo(partition) > 0) --right; while (left < right && postings[left].term.compareTo(partition) <= 0) ++left; if (left < right) { Posting tmp = postings[left]; postings[left] = postings[right]; postings[right] = tmp; --right; } else { break; } } quickSort(postings, lo, left); quickSort(postings, left + 1, hi);
void setInfoStream(java.io.PrintStream infoStream)
If non-null, a message will be printed to this if maxFieldLength is reached.
this.infoStream = infoStream;
private final org.apache.lucene.index.Posting[] sortPostingTable()
// copy postingTable into an array Posting[] array = new Posting[postingTable.size()]; Enumeration postings = postingTable.elements(); for (int i = 0; postings.hasMoreElements(); i++) array[i] = (Posting) postings.nextElement(); // sort the array quickSort(array, 0, array.length - 1); return array;
private final void writeNorms(java.lang.String segment)
for(int n = 0; n < fieldInfos.size(); n++){ FieldInfo fi = fieldInfos.fieldInfo(n); if(fi.isIndexed && !fi.omitNorms){ float norm = fieldBoosts[n] * similarity.lengthNorm(fi.name, fieldLengths[n]); IndexOutput norms = directory.createOutput(segment + ".f" + n); try { norms.writeByte(Similarity.encodeNorm(norm)); } finally { norms.close(); } } }
private final void writePostings(org.apache.lucene.index.Posting[] postings, java.lang.String segment)
IndexOutput freq = null, prox = null; TermInfosWriter tis = null; TermVectorsWriter termVectorWriter = null; try { //open files for inverse index storage freq = directory.createOutput(segment + ".frq"); prox = directory.createOutput(segment + ".prx"); tis = new TermInfosWriter(directory, segment, fieldInfos, termIndexInterval); TermInfo ti = new TermInfo(); String currentField = null; boolean currentFieldHasPayloads = false; for (int i = 0; i < postings.length; i++) { Posting posting = postings[i]; // check to see if we switched to a new field String termField = posting.term.field(); if (currentField != termField) { // changing field - see if there is something to save currentField = termField; FieldInfo fi = fieldInfos.fieldInfo(currentField); currentFieldHasPayloads = fi.storePayloads; if (fi.storeTermVector) { if (termVectorWriter == null) { termVectorWriter = new TermVectorsWriter(directory, segment, fieldInfos); termVectorWriter.openDocument(); } termVectorWriter.openField(currentField); } else if (termVectorWriter != null) { termVectorWriter.closeField(); } } // add an entry to the dictionary with pointers to prox and freq files ti.set(1, freq.getFilePointer(), prox.getFilePointer(), -1); tis.add(posting.term, ti); // add an entry to the freq file int postingFreq = posting.freq; if (postingFreq == 1) // optimize freq=1 freq.writeVInt(1); // set low bit of doc num. else { freq.writeVInt(0); // the document number freq.writeVInt(postingFreq); // frequency in doc } int lastPosition = 0; // write positions int[] positions = posting.positions; Payload[] payloads = posting.payloads; int lastPayloadLength = -1; // The following encoding is being used for positions and payloads: // Case 1: current field does not store payloads // Positions -> <PositionDelta>^freq // PositionDelta -> VInt // The PositionDelta is the difference between the current // and the previous position // Case 2: current field stores payloads // Positions -> <PositionDelta, Payload>^freq // Payload -> <PayloadLength?, PayloadData> // PositionDelta -> VInt // PayloadLength -> VInt // PayloadData -> byte^PayloadLength // In this case PositionDelta/2 is the difference between // the current and the previous position. If PositionDelta // is odd, then a PayloadLength encoded as VInt follows, // if PositionDelta is even, then it is assumed that the // length of the current Payload equals the length of the // previous Payload. for (int j = 0; j < postingFreq; j++) { // use delta-encoding int position = positions[j]; int delta = position - lastPosition; if (currentFieldHasPayloads) { int payloadLength = 0; Payload payload = null; if (payloads != null) { payload = payloads[j]; if (payload != null) { payloadLength = payload.length; } } if (payloadLength == lastPayloadLength) { // the length of the current payload equals the length // of the previous one. So we do not have to store the length // again and we only shift the position delta by one bit prox.writeVInt(delta * 2); } else { // the length of the current payload is different from the // previous one. We shift the position delta, set the lowest // bit and store the current payload length as VInt. prox.writeVInt(delta * 2 + 1); prox.writeVInt(payloadLength); lastPayloadLength = payloadLength; } if (payloadLength > 0) { // write current payload prox.writeBytes(payload.data, payload.offset, payload.length); } } else { // field does not store payloads, just write position delta as VInt prox.writeVInt(delta); } lastPosition = position; } if (termVectorWriter != null && termVectorWriter.isFieldOpen()) { termVectorWriter.addTerm(posting.term.text(), postingFreq, posting.positions, posting.offsets); } } if (termVectorWriter != null) termVectorWriter.closeDocument(); } finally { // make an effort to close all streams we can but remember and re-throw // the first exception encountered in this process IOException keep = null; if (freq != null) try { freq.close(); } catch (IOException e) { if (keep == null) keep = e; } if (prox != null) try { prox.close(); } catch (IOException e) { if (keep == null) keep = e; } if (tis != null) try { tis.close(); } catch (IOException e) { if (keep == null) keep = e; } if (termVectorWriter != null) try { termVectorWriter.close(); } catch (IOException e) { if (keep == null) keep = e; } if (keep != null) throw (IOException) keep.fillInStackTrace(); }