DocumentWriterpublic final class DocumentWriter extends Object Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. |
Fields Summary |
---|
private Analyzer | analyzer | private Directory | directory | private Similarity | similarity | private FieldInfos | fieldInfos | private int | maxFieldLength | private int | termIndexInterval | private PrintStream | infoStream | private final Hashtable | postingTable | private int[] | fieldLengths | private int[] | fieldPositions | private int[] | fieldOffsets | private float[] | fieldBoosts | private BitSet | fieldStoresPayloads | private List | openTokenStreams | private final Term | termBuffer |
Constructors Summary |
---|
DocumentWriter(Directory directory, Analyzer analyzer, Similarity similarity, int maxFieldLength)This ctor used by test code only.
this.directory = directory;
this.analyzer = analyzer;
this.similarity = similarity;
this.maxFieldLength = maxFieldLength;
| DocumentWriter(Directory directory, Analyzer analyzer, IndexWriter writer)
this.directory = directory;
this.analyzer = analyzer;
this.similarity = writer.getSimilarity();
this.maxFieldLength = writer.getMaxFieldLength();
this.termIndexInterval = writer.getTermIndexInterval();
|
Methods Summary |
---|
final void | addDocument(java.lang.String segment, org.apache.lucene.document.Document doc)
// create field infos
fieldInfos = new FieldInfos();
fieldInfos.add(doc);
// invert doc into postingTable
postingTable.clear(); // clear postingTable
fieldLengths = new int[fieldInfos.size()]; // init fieldLengths
fieldPositions = new int[fieldInfos.size()]; // init fieldPositions
fieldOffsets = new int[fieldInfos.size()]; // init fieldOffsets
fieldStoresPayloads = new BitSet(fieldInfos.size());
fieldBoosts = new float[fieldInfos.size()]; // init fieldBoosts
Arrays.fill(fieldBoosts, doc.getBoost());
try {
// Before we write the FieldInfos we invert the Document. The reason is that
// during invertion the TokenStreams of tokenized fields are being processed
// and we might encounter tokens that have payloads associated with them. In
// this case we have to update the FieldInfo of the particular field.
invertDocument(doc);
// sort postingTable into an array
Posting[] postings = sortPostingTable();
// write field infos
fieldInfos.write(directory, segment + ".fnm");
// write field values
FieldsWriter fieldsWriter =
new FieldsWriter(directory, segment, fieldInfos);
try {
fieldsWriter.addDocument(doc);
} finally {
fieldsWriter.close();
}
/*
for (int i = 0; i < postings.length; i++) {
Posting posting = postings[i];
System.out.print(posting.term);
System.out.print(" freq=" + posting.freq);
System.out.print(" pos=");
System.out.print(posting.positions[0]);
for (int j = 1; j < posting.freq; j++)
System.out.print("," + posting.positions[j]);
System.out.println("");
}
*/
// write postings
writePostings(postings, segment);
// write norms of indexed fields
writeNorms(segment);
} finally {
// close TokenStreams
IOException ex = null;
Iterator it = openTokenStreams.iterator();
while (it.hasNext()) {
try {
((TokenStream) it.next()).close();
} catch (IOException e) {
if (ex != null) {
ex = e;
}
}
}
openTokenStreams.clear();
if (ex != null) {
throw ex;
}
}
| private final void | addPosition(java.lang.String field, java.lang.String text, int position, org.apache.lucene.index.Payload payload, org.apache.lucene.index.TermVectorOffsetInfo offset) // avoid consing
termBuffer.set(field, text);
//System.out.println("Offset: " + offset);
Posting ti = (Posting) postingTable.get(termBuffer);
if (ti != null) { // word seen before
int freq = ti.freq;
if (ti.positions.length == freq) { // positions array is full
int[] newPositions = new int[freq * 2]; // double size
int[] positions = ti.positions;
System.arraycopy(positions, 0, newPositions, 0, freq);
ti.positions = newPositions;
if (ti.payloads != null) {
// the current field stores payloads
Payload[] newPayloads = new Payload[freq * 2]; // grow payloads array
Payload[] payloads = ti.payloads;
System.arraycopy(payloads, 0, newPayloads, 0, payloads.length);
ti.payloads = newPayloads;
}
}
ti.positions[freq] = position; // add new position
if (payload != null) {
if (ti.payloads == null) {
// lazily allocate payload array
ti.payloads = new Payload[ti.positions.length];
}
ti.payloads[freq] = payload;
}
if (offset != null) {
if (ti.offsets.length == freq){
TermVectorOffsetInfo [] newOffsets = new TermVectorOffsetInfo[freq*2];
TermVectorOffsetInfo [] offsets = ti.offsets;
System.arraycopy(offsets, 0, newOffsets, 0, freq);
ti.offsets = newOffsets;
}
ti.offsets[freq] = offset;
}
ti.freq = freq + 1; // update frequency
} else { // word not seen before
Term term = new Term(field, text, false);
postingTable.put(term, new Posting(term, position, payload, offset));
}
| int | getNumFields()
return fieldInfos.size();
| private final void | invertDocument(org.apache.lucene.document.Document doc)
// Tokenizes the fields of a document into Postings.
Iterator fieldIterator = doc.getFields().iterator();
while (fieldIterator.hasNext()) {
Fieldable field = (Fieldable) fieldIterator.next();
String fieldName = field.name();
int fieldNumber = fieldInfos.fieldNumber(fieldName);
int length = fieldLengths[fieldNumber]; // length of field
int position = fieldPositions[fieldNumber]; // position in field
if (length>0) position+=analyzer.getPositionIncrementGap(fieldName);
int offset = fieldOffsets[fieldNumber]; // offset field
if (field.isIndexed()) {
if (!field.isTokenized()) { // un-tokenized field
String stringValue = field.stringValue();
if(field.isStoreOffsetWithTermVector())
addPosition(fieldName, stringValue, position++, null, new TermVectorOffsetInfo(offset, offset + stringValue.length()));
else
addPosition(fieldName, stringValue, position++, null, null);
offset += stringValue.length();
length++;
} else
{ // tokenized field
TokenStream stream = field.tokenStreamValue();
// the field does not have a TokenStream,
// so we have to obtain one from the analyzer
if (stream == null) {
Reader reader; // find or make Reader
if (field.readerValue() != null)
reader = field.readerValue();
else if (field.stringValue() != null)
reader = new StringReader(field.stringValue());
else
throw new IllegalArgumentException
("field must have either String or Reader value");
// Tokenize field and add to postingTable
stream = analyzer.tokenStream(fieldName, reader);
}
// remember this TokenStream, we must close it later
openTokenStreams.add(stream);
// reset the TokenStream to the first token
stream.reset();
Token lastToken = null;
for (Token t = stream.next(); t != null; t = stream.next()) {
position += (t.getPositionIncrement() - 1);
Payload payload = t.getPayload();
if (payload != null) {
// enable payloads for this field
fieldStoresPayloads.set(fieldNumber);
}
TermVectorOffsetInfo termVectorOffsetInfo;
if (field.isStoreOffsetWithTermVector()) {
termVectorOffsetInfo = new TermVectorOffsetInfo(offset + t.startOffset(), offset + t.endOffset());
} else {
termVectorOffsetInfo = null;
}
addPosition(fieldName, t.termText(), position++, payload, termVectorOffsetInfo);
lastToken = t;
if (++length >= maxFieldLength) {
if (infoStream != null)
infoStream.println("maxFieldLength " +maxFieldLength+ " reached, ignoring following tokens");
break;
}
}
if(lastToken != null)
offset += lastToken.endOffset() + 1;
}
fieldLengths[fieldNumber] = length; // save field length
fieldPositions[fieldNumber] = position; // save field position
fieldBoosts[fieldNumber] *= field.getBoost();
fieldOffsets[fieldNumber] = offset;
}
}
// update fieldInfos for all fields that have one or more tokens with payloads
for (int i = fieldStoresPayloads.nextSetBit(0); i >= 0; i = fieldStoresPayloads.nextSetBit(i+1)) {
fieldInfos.fieldInfo(i).storePayloads = true;
}
| private static final void | quickSort(org.apache.lucene.index.Posting[] postings, int lo, int hi)
if (lo >= hi)
return;
int mid = (lo + hi) / 2;
if (postings[lo].term.compareTo(postings[mid].term) > 0) {
Posting tmp = postings[lo];
postings[lo] = postings[mid];
postings[mid] = tmp;
}
if (postings[mid].term.compareTo(postings[hi].term) > 0) {
Posting tmp = postings[mid];
postings[mid] = postings[hi];
postings[hi] = tmp;
if (postings[lo].term.compareTo(postings[mid].term) > 0) {
Posting tmp2 = postings[lo];
postings[lo] = postings[mid];
postings[mid] = tmp2;
}
}
int left = lo + 1;
int right = hi - 1;
if (left >= right)
return;
Term partition = postings[mid].term;
for (; ;) {
while (postings[right].term.compareTo(partition) > 0)
--right;
while (left < right && postings[left].term.compareTo(partition) <= 0)
++left;
if (left < right) {
Posting tmp = postings[left];
postings[left] = postings[right];
postings[right] = tmp;
--right;
} else {
break;
}
}
quickSort(postings, lo, left);
quickSort(postings, left + 1, hi);
| void | setInfoStream(java.io.PrintStream infoStream)If non-null, a message will be printed to this if maxFieldLength is reached.
this.infoStream = infoStream;
| private final org.apache.lucene.index.Posting[] | sortPostingTable()
// copy postingTable into an array
Posting[] array = new Posting[postingTable.size()];
Enumeration postings = postingTable.elements();
for (int i = 0; postings.hasMoreElements(); i++)
array[i] = (Posting) postings.nextElement();
// sort the array
quickSort(array, 0, array.length - 1);
return array;
| private final void | writeNorms(java.lang.String segment)
for(int n = 0; n < fieldInfos.size(); n++){
FieldInfo fi = fieldInfos.fieldInfo(n);
if(fi.isIndexed && !fi.omitNorms){
float norm = fieldBoosts[n] * similarity.lengthNorm(fi.name, fieldLengths[n]);
IndexOutput norms = directory.createOutput(segment + ".f" + n);
try {
norms.writeByte(Similarity.encodeNorm(norm));
} finally {
norms.close();
}
}
}
| private final void | writePostings(org.apache.lucene.index.Posting[] postings, java.lang.String segment)
IndexOutput freq = null, prox = null;
TermInfosWriter tis = null;
TermVectorsWriter termVectorWriter = null;
try {
//open files for inverse index storage
freq = directory.createOutput(segment + ".frq");
prox = directory.createOutput(segment + ".prx");
tis = new TermInfosWriter(directory, segment, fieldInfos,
termIndexInterval);
TermInfo ti = new TermInfo();
String currentField = null;
boolean currentFieldHasPayloads = false;
for (int i = 0; i < postings.length; i++) {
Posting posting = postings[i];
// check to see if we switched to a new field
String termField = posting.term.field();
if (currentField != termField) {
// changing field - see if there is something to save
currentField = termField;
FieldInfo fi = fieldInfos.fieldInfo(currentField);
currentFieldHasPayloads = fi.storePayloads;
if (fi.storeTermVector) {
if (termVectorWriter == null) {
termVectorWriter =
new TermVectorsWriter(directory, segment, fieldInfos);
termVectorWriter.openDocument();
}
termVectorWriter.openField(currentField);
} else if (termVectorWriter != null) {
termVectorWriter.closeField();
}
}
// add an entry to the dictionary with pointers to prox and freq files
ti.set(1, freq.getFilePointer(), prox.getFilePointer(), -1);
tis.add(posting.term, ti);
// add an entry to the freq file
int postingFreq = posting.freq;
if (postingFreq == 1) // optimize freq=1
freq.writeVInt(1); // set low bit of doc num.
else {
freq.writeVInt(0); // the document number
freq.writeVInt(postingFreq); // frequency in doc
}
int lastPosition = 0; // write positions
int[] positions = posting.positions;
Payload[] payloads = posting.payloads;
int lastPayloadLength = -1;
// The following encoding is being used for positions and payloads:
// Case 1: current field does not store payloads
// Positions -> <PositionDelta>^freq
// PositionDelta -> VInt
// The PositionDelta is the difference between the current
// and the previous position
// Case 2: current field stores payloads
// Positions -> <PositionDelta, Payload>^freq
// Payload -> <PayloadLength?, PayloadData>
// PositionDelta -> VInt
// PayloadLength -> VInt
// PayloadData -> byte^PayloadLength
// In this case PositionDelta/2 is the difference between
// the current and the previous position. If PositionDelta
// is odd, then a PayloadLength encoded as VInt follows,
// if PositionDelta is even, then it is assumed that the
// length of the current Payload equals the length of the
// previous Payload.
for (int j = 0; j < postingFreq; j++) { // use delta-encoding
int position = positions[j];
int delta = position - lastPosition;
if (currentFieldHasPayloads) {
int payloadLength = 0;
Payload payload = null;
if (payloads != null) {
payload = payloads[j];
if (payload != null) {
payloadLength = payload.length;
}
}
if (payloadLength == lastPayloadLength) {
// the length of the current payload equals the length
// of the previous one. So we do not have to store the length
// again and we only shift the position delta by one bit
prox.writeVInt(delta * 2);
} else {
// the length of the current payload is different from the
// previous one. We shift the position delta, set the lowest
// bit and store the current payload length as VInt.
prox.writeVInt(delta * 2 + 1);
prox.writeVInt(payloadLength);
lastPayloadLength = payloadLength;
}
if (payloadLength > 0) {
// write current payload
prox.writeBytes(payload.data, payload.offset, payload.length);
}
} else {
// field does not store payloads, just write position delta as VInt
prox.writeVInt(delta);
}
lastPosition = position;
}
if (termVectorWriter != null && termVectorWriter.isFieldOpen()) {
termVectorWriter.addTerm(posting.term.text(), postingFreq, posting.positions, posting.offsets);
}
}
if (termVectorWriter != null)
termVectorWriter.closeDocument();
} finally {
// make an effort to close all streams we can but remember and re-throw
// the first exception encountered in this process
IOException keep = null;
if (freq != null) try { freq.close(); } catch (IOException e) { if (keep == null) keep = e; }
if (prox != null) try { prox.close(); } catch (IOException e) { if (keep == null) keep = e; }
if (tis != null) try { tis.close(); } catch (IOException e) { if (keep == null) keep = e; }
if (termVectorWriter != null) try { termVectorWriter.close(); } catch (IOException e) { if (keep == null) keep = e; }
if (keep != null) throw (IOException) keep.fillInStackTrace();
}
|
|