File Doc Category Size Date Package
HDFObjectFactory.java API Doc Apache Poi 3.0.1 21600 Mon Jan 01 18:55:22 GMT 2007 org.apache.poi.hdf.model

HDFObjectFactory

java.lang.Object

public class HDFObjectFactory extends Object

The Object Factory takes in a stream and creates the low level objects that represent the data.

author: andy

Fields Summary
private POIFSFileSystem
_filesystem
The POIFS (OLE 2) file system that the document's streams are read from
private FileInformationBlock
_fib
The FIB (FileInformationBlock) built from the main document stream
private HDFLowLevelParsingListener
_listener
Used to set up the object model
private ParsingState
_charParsingState
parsing state for characters
private ParsingState
_parParsingState
parsing state for paragraphs
byte[]
_mainDocument
main document stream buffer
byte[]
_tableBuffer
table stream buffer
Constructors Summary
protected HDFObjectFactory(InputStream istream, HDFLowLevelParsingListener l)
Creates a new instance of HDFObjectFactory
param
istream The InputStream that is the Word document
if (l == null) { _listener = new HDFObjectModel(); } else { _listener = l; } //do Ole stuff _filesystem = new POIFSFileSystem(istream); DocumentEntry headerProps = (DocumentEntry)_filesystem.getRoot().getEntry("WordDocument"); _mainDocument = new byte[headerProps.getSize()]; _filesystem.createDocumentInputStream("WordDocument").read(_mainDocument); _fib = new FileInformationBlock(_mainDocument); initTableStream(); initTextPieces(); initFormattingProperties();
/**
 * Creates a new instance of HDFObjectFactory without a caller-supplied listener.
 *
 * <p>This simply delegates to the two-argument constructor, passing
 * {@code null} as the listener.
 *
 * @param istream The InputStream that is the Word document
 * @throws IOException if the document cannot be read
 */
public HDFObjectFactory(InputStream istream) throws IOException
{
  this(istream, null);
}
Methods Summary
private void createFontTable()
Initializes this document's FontTable;
int fontTableIndex = _fib.getFcSttbfffn(); int fontTableSize = _fib.getLcbSttbfffn(); byte[] fontTable = new byte[fontTableSize]; System.arraycopy(_tableBuffer, fontTableIndex, fontTable, 0, fontTableSize); _listener.fonts(new FontTable(fontTable));
private void createListTables()
Initializes the list tables for this document
int lfoOffset = _fib.getFcPlfLfo(); int lfoSize = _fib.getLcbPlfLfo(); byte[] plflfo = new byte[lfoSize]; System.arraycopy(_tableBuffer, lfoOffset, plflfo, 0, lfoSize); int lstOffset = _fib.getFcPlcfLst(); int lstSize = _fib.getLcbPlcfLst(); if (lstOffset > 0 && lstSize > 0) { // The lstSize returned by _fib.getLcbPlcfLst() doesn't appear // to take into account any LVLs. Therefore, we recalculate // lstSize based on where the LFO section begins (because the // LFO section immediately follows the LST section). lstSize = lfoOffset - lstOffset; byte[] plcflst = new byte[lstSize]; System.arraycopy(_tableBuffer, lstOffset, plcflst, 0, lstSize); _listener.lists(new ListTables(plcflst, plflfo)); }
private void createStyleSheet()
Uncompresses the StyleSheet from file into memory.
int stshIndex = _fib.getFcStshf(); int stshSize = _fib.getLcbStshf(); byte[] stsh = new byte[stshSize]; System.arraycopy(_tableBuffer, stshIndex, stsh, 0, stshSize); _listener.styleSheet(new StyleSheet(stsh));
public static java.util.List getTypes(java.io.InputStream istream)
List results = new ArrayList(1); //do Ole stuff POIFSFileSystem filesystem = new POIFSFileSystem(istream); DocumentEntry headerProps = (DocumentEntry)filesystem.getRoot().getEntry("WordDocument"); byte[] mainDocument = new byte[headerProps.getSize()]; filesystem.createDocumentInputStream("WordDocument").read(mainDocument); FileInformationBlock fib = new FileInformationBlock(mainDocument); results.add(fib); return results;
private void initCharacterProperties(int charOffset, org.apache.poi.hdf.model.hdftypes.PlexOfCps charPlcf, int start, int end)
//Initialize paragraph property stuff //int currentCharPage = _charParsingState.getCurrentPage(); int charPlcfLen = charPlcf.length(); int currentPageIndex = _charParsingState.getCurrentPageIndex(); FormattedDiskPage fkp = _charParsingState.getFkp(); int currentChpxIndex = _charParsingState.getCurrentPropIndex(); int currentArraySize = fkp.size(); //get the character runs for this paragraph int charStart = 0; int charEnd = 0; //add the character runs do { if (currentChpxIndex < currentArraySize) { charStart = fkp.getStart(currentChpxIndex); charEnd = fkp.getEnd(currentChpxIndex); byte[] chpx = fkp.getGrpprl(currentChpxIndex); _listener.characterRun(new ChpxNode(Math.max(charStart, start), Math.min(charEnd, end), chpx)); if (charEnd < end) { currentChpxIndex++; } else { _charParsingState.setState(currentPageIndex, fkp, currentChpxIndex); break; } } else { int currentCharPage = LittleEndian.getInt(_tableBuffer, charOffset + charPlcf.getStructOffset(++currentPageIndex)); byte[] byteFkp = new byte[512]; System.arraycopy(_mainDocument, (currentCharPage * 512), byteFkp, 0, 512); fkp = new CHPFormattedDiskPage(byteFkp); currentChpxIndex = 0; currentArraySize = fkp.size(); } } while(currentPageIndex < charPlcfLen);
private void initDocumentProperties()
Initializes the DocumentProperties object unique to this document.
int pos = _fib.getFcDop(); int size = _fib.getLcbDop(); byte[] dopArray = new byte[size]; System.arraycopy(_tableBuffer, pos, dopArray, 0, size); _listener.document(new DocumentProperties(dopArray));
private void initFormattingProperties()
initializes all of the formatting properties for a Word Document
createStyleSheet(); createListTables(); createFontTable(); initDocumentProperties(); initSectionProperties(); //initCharacterProperties(); //initParagraphProperties();
private void initParagraphProperties()
intializes the Paragraph Properties BTree
//paragraphs int parOffset = _fib.getFcPlcfbtePapx(); int parPlcSize = _fib.getLcbPlcfbtePapx(); //characters int charOffset = _fib.getFcPlcfbteChpx(); int charPlcSize = _fib.getLcbPlcfbteChpx(); PlexOfCps charPlcf = new PlexOfCps(charPlcSize, 4); PlexOfCps parPlcf = new PlexOfCps(parPlcSize, 4); //Initialize character property stuff int currentCharPage = LittleEndian.getInt(_tableBuffer, charOffset + charPlcf.getStructOffset(0)); int charPlcfLen = charPlcf.length(); int currentPageIndex = 0; byte[] fkp = new byte[512]; System.arraycopy(_mainDocument, (currentCharPage * 512), fkp, 0, 512); CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(fkp); int currentChpxIndex = 0; int currentArraySize = cfkp.size(); int arraySize = parPlcf.length(); //first we must go through the bin table and find the fkps for(int x = 0; x < arraySize; x++) { int PN = LittleEndian.getInt(_tableBuffer, parOffset + parPlcf.getStructOffset(x)); fkp = new byte[512]; System.arraycopy(_mainDocument, (PN * 512), fkp, 0, 512); PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(fkp); //take each fkp and get the paps int crun = pfkp.size(); for(int y = 0; y < crun; y++) { //get the beginning fc of each paragraph text run int fcStart = pfkp.getStart(y); int fcEnd = pfkp.getEnd(y); //get the papx for this paragraph byte[] papx = pfkp.getGrpprl(y); _listener.paragraph(new PapxNode(fcStart, fcEnd, papx)); //get the character runs for this paragraph int charStart = 0; int charEnd = 0; //add the character runs do { if (currentChpxIndex < currentArraySize) { charStart = cfkp.getStart(currentChpxIndex); charEnd = cfkp.getEnd(currentChpxIndex); byte[] chpx = cfkp.getGrpprl(currentChpxIndex); _listener.characterRun(new ChpxNode(charStart, charEnd, chpx)); if (charEnd < fcEnd) { currentChpxIndex++; } else { break; } } else { currentCharPage = LittleEndian.getInt(_tableBuffer, charOffset + charPlcf.getStructOffset(++currentPageIndex)); fkp = new byte[512]; System.arraycopy(_mainDocument, (currentCharPage * 
512), fkp, 0, 512); cfkp = new CHPFormattedDiskPage(fkp); currentChpxIndex = 0; currentArraySize = cfkp.size(); } } while(currentCharPage <= charPlcfLen + 1); } }
private void initParagraphProperties(int parOffset, org.apache.poi.hdf.model.hdftypes.PlexOfCps parPlcf, int charOffset, org.apache.poi.hdf.model.hdftypes.PlexOfCps charPlcf, int start, int end)
//Initialize paragraph property stuff //int currentParPage = _parParsingState.getCurrentPage(); int parPlcfLen = parPlcf.length(); int currentPageIndex = _parParsingState.getCurrentPageIndex(); FormattedDiskPage fkp = _parParsingState.getFkp(); int currentPapxIndex = _parParsingState.getCurrentPropIndex(); int currentArraySize = fkp.size(); do { if (currentPapxIndex < currentArraySize) { int parStart = fkp.getStart(currentPapxIndex); int parEnd = fkp.getEnd(currentPapxIndex); byte[] papx = fkp.getGrpprl(currentPapxIndex); _listener.paragraph(new PapxNode(Math.max(parStart, start), Math.min(parEnd, end), papx)); initCharacterProperties(charOffset, charPlcf, Math.max(start, parStart), Math.min(parEnd, end)); if (parEnd < end) { currentPapxIndex++; } else { //save the state _parParsingState.setState(currentPageIndex, fkp, currentPapxIndex); break; } } else { int currentParPage = LittleEndian.getInt(_tableBuffer, parOffset + parPlcf.getStructOffset(++currentPageIndex)); byte byteFkp[] = new byte[512]; System.arraycopy(_mainDocument, (currentParPage * 512), byteFkp, 0, 512); fkp = new PAPFormattedDiskPage(byteFkp); currentPapxIndex = 0; currentArraySize = fkp.size(); } } while(currentPageIndex < parPlcfLen);
private void initParsingStates(int parOffset, org.apache.poi.hdf.model.hdftypes.PlexOfCps parPlcf, int charOffset, org.apache.poi.hdf.model.hdftypes.PlexOfCps charPlcf)
int currentCharPage = LittleEndian.getInt(_tableBuffer, charOffset + charPlcf.getStructOffset(0)); byte[] fkp = new byte[512]; System.arraycopy(_mainDocument, (currentCharPage * 512), fkp, 0, 512); CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(fkp); _charParsingState = new ParsingState(currentCharPage, cfkp); int currentParPage = LittleEndian.getInt(_tableBuffer, parOffset + parPlcf.getStructOffset(0)); fkp = new byte[512]; System.arraycopy(_mainDocument, (currentParPage * 512), fkp, 0, 512); PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(fkp); _parParsingState = new ParsingState(currentParPage, pfkp);
private void initSectionProperties()
initializes the SectionProperties BTree
int ccpText = _fib.getCcpText(); int ccpFtn = _fib.getCcpFtn(); //sections int fcMin = _fib.getFcMin(); int plcfsedFC = _fib.getFcPlcfsed(); int plcfsedSize = _fib.getLcbPlcfsed(); //paragraphs int parOffset = _fib.getFcPlcfbtePapx(); int parPlcSize = _fib.getLcbPlcfbtePapx(); //characters int charOffset = _fib.getFcPlcfbteChpx(); int charPlcSize = _fib.getLcbPlcfbteChpx(); PlexOfCps charPlcf = new PlexOfCps(charPlcSize, 4); PlexOfCps parPlcf = new PlexOfCps(parPlcSize, 4); initParsingStates(parOffset, parPlcf, charOffset, charPlcf); //byte[] plcfsed = new byte[plcfsedSize]; //System.arraycopy(_tableBuffer, plcfsedFC, plcfsed, 0, plcfsedSize); PlexOfCps plcfsed = new PlexOfCps(plcfsedSize, 12); int arraySize = plcfsed.length(); int start = fcMin; int end = fcMin + ccpText; int x = 0; int sectionEnd = 0; //do the main body sections while (x < arraySize) { int sectionStart = LittleEndian.getInt(_tableBuffer, plcfsedFC + plcfsed.getIntOffset(x)) + fcMin; sectionEnd = LittleEndian.getInt(_tableBuffer, plcfsedFC + plcfsed.getIntOffset(x + 1)) + fcMin; int sepxStart = LittleEndian.getInt(_tableBuffer, plcfsedFC + plcfsed.getStructOffset(x) + 2); int sepxSize = LittleEndian.getShort(_mainDocument, sepxStart); byte[] sepx = new byte[sepxSize]; System.arraycopy(_mainDocument, sepxStart + 2, sepx, 0, sepxSize); SepxNode node = new SepxNode(x + 1, sectionStart, sectionEnd, sepx); _listener.bodySection(node); initParagraphProperties(parOffset, parPlcf, charOffset, charPlcf, sectionStart, Math.min(end, sectionEnd)); if (sectionEnd > end) { break; } else { x++; } } //do the header sections for (; x < arraySize; x++)// && sectionEnd <= end; x++) { int sectionStart = LittleEndian.getInt(_tableBuffer, plcfsedFC + plcfsed.getIntOffset(x)) + fcMin; sectionEnd = LittleEndian.getInt(_tableBuffer, plcfsedFC + plcfsed.getIntOffset(x + 1)) + fcMin; int sepxStart = LittleEndian.getInt(_tableBuffer, plcfsedFC + plcfsed.getStructOffset(x) + 2); int sepxSize = 
LittleEndian.getShort(_mainDocument, sepxStart); byte[] sepx = new byte[sepxSize]; System.arraycopy(_mainDocument, sepxStart + 2, sepx, 0, sepxSize); SepxNode node = new SepxNode(x + 1, sectionStart, sectionEnd, sepx); _listener.hdrSection(node); initParagraphProperties(parOffset, parPlcf, charOffset, charPlcf, Math.max(sectionStart, end), sectionEnd); } _listener.endSections();
private void initTableStream()
Initializes the table stream
throws
IOException
String tablename = null; if(_fib.isFWhichTblStm()) { tablename="1Table"; } else { tablename="0Table"; } DocumentEntry tableEntry = (DocumentEntry)_filesystem.getRoot().getEntry(tablename); //load the table stream into a buffer int size = tableEntry.getSize(); _tableBuffer = new byte[size]; _filesystem.createDocumentInputStream(tablename).read(_tableBuffer);
private void initTextPieces()
Initializes the text pieces. Text is divided into pieces because some "pieces" may only contain unicode characters.
throws
IOException
int pos = _fib.getFcClx(); //skips through the prms before we reach the piece table. These contain data //for actual fast saved files while (_tableBuffer[pos] == 1) { pos++; int skip = LittleEndian.getShort(_tableBuffer, pos); pos += 2 + skip; } if(_tableBuffer[pos] != 2) { throw new IOException("The text piece table is corrupted"); } else { //parse out the text pieces int pieceTableSize = LittleEndian.getInt(_tableBuffer, ++pos); pos += 4; int pieces = (pieceTableSize - 4) / 12; for (int x = 0; x < pieces; x++) { int filePos = LittleEndian.getInt(_tableBuffer, pos + ((pieces + 1) * 4) + (x * 8) + 2); boolean unicode = false; if ((filePos & 0x40000000) == 0) { unicode = true; } else { unicode = false; filePos &= ~(0x40000000);//gives me FC in doc stream filePos /= 2; } int totLength = LittleEndian.getInt(_tableBuffer, pos + (x + 1) * 4) - LittleEndian.getInt(_tableBuffer, pos + (x * 4)); TextPiece piece = new TextPiece(filePos, totLength, unicode); _listener.text(piece); } }
public static void main(java.lang.String[] args)
try { HDFObjectFactory f = new HDFObjectFactory(new FileInputStream("c:\\test.doc")); int k = 0; } catch(Throwable t) { t.printStackTrace(); }