HDFObjectFactory: public class HDFObjectFactory extends Object. The Object Factory takes in a stream and creates the low-level objects
that represent the data. |
Fields Summary |
---|
private POIFSFileSystem | _filesystem: OLE stuff | private FileInformationBlock | _fib: The FIB | private HDFLowLevelParsingListener | _listener: Used to set up the object model | private ParsingState | _charParsingState: parsing state for characters | private ParsingState | _parParsingState: parsing state for paragraphs | byte[] | _mainDocument: main document stream buffer | byte[] | _tableBuffer: table stream buffer |
Constructors Summary |
---|
protected HDFObjectFactory(InputStream istream, HDFLowLevelParsingListener l)Creates a new instance of HDFObjectFactory
// A null listener is replaced by a freshly created HDFObjectModel.
_listener = (l != null) ? l : new HDFObjectModel();

//do Ole stuff: open the container and copy the whole "WordDocument" stream into memory
_filesystem = new POIFSFileSystem(istream);
DocumentEntry wordDocEntry = (DocumentEntry) _filesystem.getRoot().getEntry("WordDocument");
_mainDocument = new byte[wordDocEntry.getSize()];
_filesystem.createDocumentInputStream("WordDocument").read(_mainDocument);

// The FIB is built from the main document bytes and drives everything below.
_fib = new FileInformationBlock(_mainDocument);

// Order matters: the table stream buffer must be loaded before the text
// pieces and the formatting properties are read out of it.
initTableStream();
initTextPieces();
initFormattingProperties();
| public HDFObjectFactory(InputStream istream)Creates a new instance of HDFObjectFactory
// Delegates to the two-argument constructor with no listener; that constructor
// substitutes a new HDFObjectModel when the listener is null.
this(istream, null);
|
Methods Summary |
---|
private void | createFontTable()Initializes this document's FontTable;
// The FIB supplies where the font table starts in the table stream and how long it is.
int offset = _fib.getFcSttbfffn();
int length = _fib.getLcbSttbfffn();

// Copy that slice out of the table stream buffer and pass the parsed table to the listener.
byte[] fontBytes = new byte[length];
System.arraycopy(_tableBuffer, offset, fontBytes, 0, length);
FontTable fonts = new FontTable(fontBytes);
_listener.fonts(fonts);
| private void | createListTables()Initializes the list tables for this document
// Copy the LFO section out of the table stream; this happens whether or not
// an LST section turns out to be present.
int lfoStart = _fib.getFcPlfLfo();
int lfoLength = _fib.getLcbPlfLfo();
byte[] lfoBytes = new byte[lfoLength];
System.arraycopy(_tableBuffer, lfoStart, lfoBytes, 0, lfoLength);

int lstStart = _fib.getFcPlcfLst();
int reportedLstLength = _fib.getLcbPlcfLst();
// The listener is only notified when the FIB reports a positive LST offset and size.
if (lstStart <= 0 || reportedLstLength <= 0)
{
    return;
}

// The size returned by _fib.getLcbPlcfLst() doesn't appear to take into
// account any LVLs, so the real length is derived from where the LFO
// section begins (the LFO section immediately follows the LST section).
int lstLength = lfoStart - lstStart;
byte[] lstBytes = new byte[lstLength];
System.arraycopy(_tableBuffer, lstStart, lstBytes, 0, lstLength);
_listener.lists(new ListTables(lstBytes, lfoBytes));
| private void | createStyleSheet()Uncompresses the StyleSheet from file into memory.
// Offset and byte length of the stylesheet within the table stream, as recorded in the FIB.
int offset = _fib.getFcStshf();
int length = _fib.getLcbStshf();

// Copy the raw stylesheet bytes and let StyleSheet parse them before notifying the listener.
byte[] rawStyleSheet = new byte[length];
System.arraycopy(_tableBuffer, offset, rawStyleSheet, 0, length);
StyleSheet styleSheet = new StyleSheet(rawStyleSheet);
_listener.styleSheet(styleSheet);
| public static java.util.List | getTypes(java.io.InputStream istream)
// Opens the stream as an OLE file system, reads the "WordDocument" stream into
// memory and returns a one-element list holding the FileInformationBlock built from it.
//do Ole stuff
POIFSFileSystem poifs = new POIFSFileSystem(istream);
DocumentEntry wordDocEntry = (DocumentEntry) poifs.getRoot().getEntry("WordDocument");
byte[] docBytes = new byte[wordDocEntry.getSize()];
poifs.createDocumentInputStream("WordDocument").read(docBytes);

List types = new ArrayList(1);
types.add(new FileInformationBlock(docBytes));
return types;
| private void | initCharacterProperties(int charOffset, org.apache.poi.hdf.model.hdftypes.PlexOfCps charPlcf, int start, int end)
// Reports character runs to the listener, beginning at the cursor saved in
// _charParsingState and continuing until a run reaches 'end'. Each reported
// run has its bounds clamped to start/end.
//   charOffset - base offset in _tableBuffer to which charPlcf struct offsets are added
//   charPlcf   - supplies the number of pages (length()) and per-page struct offsets
//   start, end - bounds used to clamp each run; 'end' also terminates the walk
//Initialize character property stuff: restore the cursor from the saved parsing state
//int currentCharPage = _charParsingState.getCurrentPage();
int charPlcfLen = charPlcf.length();
int currentPageIndex = _charParsingState.getCurrentPageIndex();
FormattedDiskPage fkp = _charParsingState.getFkp();
int currentChpxIndex = _charParsingState.getCurrentPropIndex();
int currentArraySize = fkp.size();
//get the character runs for this paragraph
int charStart = 0;
int charEnd = 0;
//add the character runs
do
{
// still inside the current formatted disk page
if (currentChpxIndex < currentArraySize)
{
charStart = fkp.getStart(currentChpxIndex);
charEnd = fkp.getEnd(currentChpxIndex);
byte[] chpx = fkp.getGrpprl(currentChpxIndex);
// clamp the run to the requested range before reporting it
_listener.characterRun(new ChpxNode(Math.max(charStart, start), Math.min(charEnd, end), chpx));
if (charEnd < end)
{
// run finishes before 'end': move on to the next run
currentChpxIndex++;
}
else
{
// run reaches or passes 'end': save the cursor WITHOUT advancing, so the
// same run is the starting point of the next call
_charParsingState.setState(currentPageIndex, fkp, currentChpxIndex);
break;
}
}
else
{
// current page exhausted: read the next page number from the table stream and
// load that 512-byte page from the main document stream
// NOTE(review): the index is incremented and used before the loop condition
// compares it with charPlcfLen - confirm this cannot read past the last entry
int currentCharPage = LittleEndian.getInt(_tableBuffer, charOffset + charPlcf.getStructOffset(++currentPageIndex));
byte[] byteFkp = new byte[512];
System.arraycopy(_mainDocument, (currentCharPage * 512), byteFkp, 0, 512);
fkp = new CHPFormattedDiskPage(byteFkp);
currentChpxIndex = 0;
currentArraySize = fkp.size();
}
}
// when the loop ends through this condition (rather than the break) the state is not saved
while(currentPageIndex < charPlcfLen);
| private void | initDocumentProperties()Initializes the DocumentProperties object unique to this document.
// Position and byte length of the document properties in the table stream, per the FIB.
int dopOffset = _fib.getFcDop();
int dopLength = _fib.getLcbDop();

// Copy the raw bytes out and hand the parsed DocumentProperties to the listener.
byte[] dopBytes = new byte[dopLength];
System.arraycopy(_tableBuffer, dopOffset, dopBytes, 0, dopLength);
DocumentProperties properties = new DocumentProperties(dopBytes);
_listener.document(properties);
| private void | initFormattingProperties()initializes all of the formatting properties for a Word Document
// Runs the individual initializers in a fixed order; each one reads from the
// FIB and the table stream buffer and notifies the listener.
createStyleSheet();
createListTables();
createFontTable();
initDocumentProperties();
// Section parsing also drives paragraph and character parsing: it calls the
// range-based initParagraphProperties, which in turn calls initCharacterProperties.
initSectionProperties();
// The standalone passes below are disabled.
//initCharacterProperties();
//initParagraphProperties();
| private void | initParagraphProperties()Initializes the Paragraph Properties BTree
// Walks every paragraph page listed in the paragraph bin table, reports each
// paragraph to the listener, and after each paragraph reports character runs
// until one reaches the paragraph's end. The character cursor is kept in local
// variables rather than in _charParsingState.
// NOTE(review): the only call to this no-argument variant visible in this file is
// commented out in initFormattingProperties - it appears to be unused; confirm.
//paragraphs
int parOffset = _fib.getFcPlcfbtePapx();
int parPlcSize = _fib.getLcbPlcfbtePapx();
//characters
int charOffset = _fib.getFcPlcfbteChpx();
int charPlcSize = _fib.getLcbPlcfbteChpx();
PlexOfCps charPlcf = new PlexOfCps(charPlcSize, 4);
PlexOfCps parPlcf = new PlexOfCps(parPlcSize, 4);
//Initialize character property stuff: load the first character page
int currentCharPage = LittleEndian.getInt(_tableBuffer, charOffset + charPlcf.getStructOffset(0));
int charPlcfLen = charPlcf.length();
int currentPageIndex = 0;
byte[] fkp = new byte[512];
System.arraycopy(_mainDocument, (currentCharPage * 512), fkp, 0, 512);
CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(fkp);
int currentChpxIndex = 0;
int currentArraySize = cfkp.size();
int arraySize = parPlcf.length();
//first we must go through the bin table and find the fkps
for(int x = 0; x < arraySize; x++)
{
// PN is a page number; the page is the 512 bytes at PN * 512 in the main document
int PN = LittleEndian.getInt(_tableBuffer, parOffset + parPlcf.getStructOffset(x));
fkp = new byte[512];
System.arraycopy(_mainDocument, (PN * 512), fkp, 0, 512);
PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(fkp);
//take each fkp and get the paps
int crun = pfkp.size();
for(int y = 0; y < crun; y++)
{
//get the beginning fc of each paragraph text run
int fcStart = pfkp.getStart(y);
int fcEnd = pfkp.getEnd(y);
//get the papx for this paragraph
byte[] papx = pfkp.getGrpprl(y);
_listener.paragraph(new PapxNode(fcStart, fcEnd, papx));
//get the character runs for this paragraph
int charStart = 0;
int charEnd = 0;
//add the character runs
do
{
if (currentChpxIndex < currentArraySize)
{
charStart = cfkp.getStart(currentChpxIndex);
charEnd = cfkp.getEnd(currentChpxIndex);
byte[] chpx = cfkp.getGrpprl(currentChpxIndex);
// unlike the range-based variant, run bounds are reported unclamped
_listener.characterRun(new ChpxNode(charStart, charEnd, chpx));
if (charEnd < fcEnd)
{
currentChpxIndex++;
}
else
{
// run reaches the paragraph end: stop without advancing, so the same
// run is reported again for the next paragraph
break;
}
}
else
{
// character page exhausted: load the next one listed in the bin table
currentCharPage = LittleEndian.getInt(_tableBuffer, charOffset + charPlcf.getStructOffset(++currentPageIndex));
fkp = new byte[512];
System.arraycopy(_mainDocument, (currentCharPage * 512), fkp, 0, 512);
cfkp = new CHPFormattedDiskPage(fkp);
currentChpxIndex = 0;
currentArraySize = cfkp.size();
}
}
// NOTE(review): this compares a page NUMBER with the number of bin table entries,
// whereas initCharacterProperties compares the page INDEX - looks unintended; confirm
while(currentCharPage <= charPlcfLen + 1);
}
}
| private void | initParagraphProperties(int parOffset, org.apache.poi.hdf.model.hdftypes.PlexOfCps parPlcf, int charOffset, org.apache.poi.hdf.model.hdftypes.PlexOfCps charPlcf, int start, int end)
// Reports paragraphs to the listener, beginning at the cursor saved in
// _parParsingState and continuing until a paragraph reaches 'end'. For every
// paragraph it also triggers character-run parsing over the same clamped range.
//   parOffset / parPlcf   - locate paragraph page numbers in _tableBuffer
//   charOffset / charPlcf - passed through unchanged to initCharacterProperties
//   start, end            - bounds used to clamp each paragraph; 'end' terminates the walk
//Initialize paragraph property stuff: restore the cursor from the saved parsing state
//int currentParPage = _parParsingState.getCurrentPage();
int parPlcfLen = parPlcf.length();
int currentPageIndex = _parParsingState.getCurrentPageIndex();
FormattedDiskPage fkp = _parParsingState.getFkp();
int currentPapxIndex = _parParsingState.getCurrentPropIndex();
int currentArraySize = fkp.size();
do
{
// still inside the current formatted disk page
if (currentPapxIndex < currentArraySize)
{
int parStart = fkp.getStart(currentPapxIndex);
int parEnd = fkp.getEnd(currentPapxIndex);
byte[] papx = fkp.getGrpprl(currentPapxIndex);
// report the paragraph clamped to the requested range, then its character runs
_listener.paragraph(new PapxNode(Math.max(parStart, start), Math.min(parEnd, end), papx));
initCharacterProperties(charOffset, charPlcf, Math.max(start, parStart), Math.min(parEnd, end));
if (parEnd < end)
{
// paragraph finishes before 'end': move on to the next one
currentPapxIndex++;
}
else
{
//save the state WITHOUT advancing, so the same paragraph is the starting
//point of the next call
_parParsingState.setState(currentPageIndex, fkp, currentPapxIndex);
break;
}
}
else
{
// current page exhausted: read the next page number from the table stream and
// load that 512-byte page from the main document stream
// NOTE(review): the index is incremented and used before the loop condition
// compares it with parPlcfLen - confirm this cannot read past the last entry
int currentParPage = LittleEndian.getInt(_tableBuffer, parOffset + parPlcf.getStructOffset(++currentPageIndex));
byte byteFkp[] = new byte[512];
System.arraycopy(_mainDocument, (currentParPage * 512), byteFkp, 0, 512);
fkp = new PAPFormattedDiskPage(byteFkp);
currentPapxIndex = 0;
currentArraySize = fkp.size();
}
}
// when the loop ends through this condition (rather than the break) the state is not saved
while(currentPageIndex < parPlcfLen);
| private void | initParsingStates(int parOffset, org.apache.poi.hdf.model.hdftypes.PlexOfCps parPlcf, int charOffset, org.apache.poi.hdf.model.hdftypes.PlexOfCps charPlcf)
// Character side: read the first page number from the table stream, copy that
// 512-byte page out of the main document and seed the character parsing state.
int firstChpPage = LittleEndian.getInt(_tableBuffer, charOffset + charPlcf.getStructOffset(0));
byte[] chpPageBytes = new byte[512];
System.arraycopy(_mainDocument, firstChpPage * 512, chpPageBytes, 0, 512);
_charParsingState = new ParsingState(firstChpPage, new CHPFormattedDiskPage(chpPageBytes));

// Paragraph side: same procedure using the paragraph offsets.
int firstPapPage = LittleEndian.getInt(_tableBuffer, parOffset + parPlcf.getStructOffset(0));
byte[] papPageBytes = new byte[512];
System.arraycopy(_mainDocument, firstPapPage * 512, papPageBytes, 0, 512);
_parParsingState = new ParsingState(firstPapPage, new PAPFormattedDiskPage(papPageBytes));
| private void | initSectionProperties()initializes the SectionProperties BTree
// Walks the section table stored in the table stream. Each section is reported to
// the listener - first as body sections, then as header sections - and the
// paragraphs (and through them the character runs) in each section's range are parsed.
int ccpText = _fib.getCcpText();
// NOTE(review): ccpFtn is read but never used in this method
int ccpFtn = _fib.getCcpFtn();
//sections
int fcMin = _fib.getFcMin();
int plcfsedFC = _fib.getFcPlcfsed();
int plcfsedSize = _fib.getLcbPlcfsed();
//paragraphs
int parOffset = _fib.getFcPlcfbtePapx();
int parPlcSize = _fib.getLcbPlcfbtePapx();
//characters
int charOffset = _fib.getFcPlcfbteChpx();
int charPlcSize = _fib.getLcbPlcfbteChpx();
PlexOfCps charPlcf = new PlexOfCps(charPlcSize, 4);
PlexOfCps parPlcf = new PlexOfCps(parPlcSize, 4);
// seed the paragraph/character cursors that the range-based parsers resume from
initParsingStates(parOffset, parPlcf, charOffset, charPlcf);
//byte[] plcfsed = new byte[plcfsedSize];
//System.arraycopy(_tableBuffer, plcfsedFC, plcfsed, 0, plcfsedSize);
// presumably 12 is the per-section struct size - confirm against PlexOfCps
PlexOfCps plcfsed = new PlexOfCps(plcfsedSize, 12);
int arraySize = plcfsed.length();
// NOTE(review): 'start' is assigned but never used in this method
int start = fcMin;
// end of the main text: fcMin plus the main text length taken from the FIB
int end = fcMin + ccpText;
int x = 0;
int sectionEnd = 0;
//do the main body sections
while (x < arraySize)
{
// section bounds are stored relative to fcMin
int sectionStart = LittleEndian.getInt(_tableBuffer, plcfsedFC + plcfsed.getIntOffset(x)) + fcMin;
sectionEnd = LittleEndian.getInt(_tableBuffer, plcfsedFC + plcfsed.getIntOffset(x + 1)) + fcMin;
// 2 bytes into the section's struct is the position of its SEPX in the main document
int sepxStart = LittleEndian.getInt(_tableBuffer, plcfsedFC + plcfsed.getStructOffset(x) + 2);
// the SEPX is a 2-byte length followed by that many bytes
int sepxSize = LittleEndian.getShort(_mainDocument, sepxStart);
byte[] sepx = new byte[sepxSize];
System.arraycopy(_mainDocument, sepxStart + 2, sepx, 0, sepxSize);
// sections are numbered from 1
SepxNode node = new SepxNode(x + 1, sectionStart, sectionEnd, sepx);
_listener.bodySection(node);
initParagraphProperties(parOffset, parPlcf, charOffset, charPlcf, sectionStart, Math.min(end, sectionEnd));
if (sectionEnd > end)
{
// this section runs past the main text: leave x unchanged so the header loop
// below starts with this same section
break;
}
else
{
x++;
}
}
//do the header sections (does nothing if the loop above consumed every section)
for (; x < arraySize; x++)// && sectionEnd <= end; x++)
{
int sectionStart = LittleEndian.getInt(_tableBuffer, plcfsedFC + plcfsed.getIntOffset(x)) + fcMin;
sectionEnd = LittleEndian.getInt(_tableBuffer, plcfsedFC + plcfsed.getIntOffset(x + 1)) + fcMin;
int sepxStart = LittleEndian.getInt(_tableBuffer, plcfsedFC + plcfsed.getStructOffset(x) + 2);
int sepxSize = LittleEndian.getShort(_mainDocument, sepxStart);
byte[] sepx = new byte[sepxSize];
System.arraycopy(_mainDocument, sepxStart + 2, sepx, 0, sepxSize);
SepxNode node = new SepxNode(x + 1, sectionStart, sectionEnd, sepx);
_listener.hdrSection(node);
// start no earlier than the end of the main text
initParagraphProperties(parOffset, parPlcf, charOffset, charPlcf, Math.max(sectionStart, end), sectionEnd);
}
_listener.endSections();
| private void | initTableStream()Initializes the table stream
// The FIB flag selects which of the two table streams this document uses.
String streamName = _fib.isFWhichTblStm() ? "1Table" : "0Table";
DocumentEntry entry = (DocumentEntry) _filesystem.getRoot().getEntry(streamName);

//load the table stream into a buffer sized to the entry
_tableBuffer = new byte[entry.getSize()];
_filesystem.createDocumentInputStream(streamName).read(_tableBuffer);
| private void | initTextPieces()Initializes the text pieces. Text is divided into pieces because some
"pieces" may only contain unicode characters.
int cursor = _fib.getFcClx();

//skip the prm entries (each tagged with 1) that precede the piece table; these
//contain data for actual fast saved files
while (_tableBuffer[cursor] == 1)
{
    cursor++;
    int prmLength = LittleEndian.getShort(_tableBuffer, cursor);
    cursor += 2 + prmLength;
}

// the piece table itself must be tagged with 2
if (_tableBuffer[cursor] != 2)
{
    throw new IOException("The text piece table is corrupted");
}

//parse out the text pieces
cursor++;
int pieceTableSize = LittleEndian.getInt(_tableBuffer, cursor);
cursor += 4;
int pieceCount = (pieceTableSize - 4) / 12;
// the descriptors start after the (pieceCount + 1) four-byte boundary values
int descriptorBase = cursor + ((pieceCount + 1) * 4);
for (int i = 0; i < pieceCount; i++)
{
    // each descriptor is 8 bytes; the file position sits 2 bytes into it
    int filePos = LittleEndian.getInt(_tableBuffer, descriptorBase + (i * 8) + 2);
    // a clear 0x40000000 bit marks a unicode piece
    boolean unicode = (filePos & 0x40000000) == 0;
    if (!unicode)
    {
        filePos &= ~(0x40000000);//gives me FC in doc stream
        filePos /= 2;
    }
    // piece length is the difference between consecutive boundary values
    int pieceStart = LittleEndian.getInt(_tableBuffer, cursor + (i * 4));
    int pieceEnd = LittleEndian.getInt(_tableBuffer, cursor + (i + 1) * 4);
    _listener.text(new TextPiece(filePos, pieceEnd - pieceStart, unicode));
}
| public static void | main(java.lang.String[] args)
// Manual smoke test: parses one Word file and prints any failure.
//   args - optional; args[0] is the file to parse. With no argument the
//          historical default path "c:\test.doc" is used.
String path = (args != null && args.length > 0) ? args[0] : "c:\\test.doc";
FileInputStream in = null;
try
{
    in = new FileInputStream(path);
    // constructing the factory performs the whole parse
    new HDFObjectFactory(in);
}
catch(Throwable t)
{
    // best-effort diagnostic tool: report anything that goes wrong, never rethrow
    t.printStackTrace();
}
finally
{
    // always release the file handle, even when parsing failed
    if (in != null)
    {
        try
        {
            in.close();
        }
        catch (IOException ignored)
        {
            // nothing useful can be done about a failed close here
        }
    }
}
|
|