File: HDFObjectFactory.java
Doc: API Doc
Category: Apache POI 3.0.1
Size: 21600 bytes
Date: Mon Jan 01 18:55:22 GMT 2007
Package: org.apache.poi.hdf.model

HDFObjectFactory

public class HDFObjectFactory extends Object
The Object Factory takes in a stream and creates the low level objects that represent the data.
author
andy
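
A minimal usage sketch, assuming the factory is driven from a separate wrapper class with the document path given on the command line (the wrapper class name is illustrative, not part of POI). The public constructor parses the stream immediately and pushes the low level objects to a default HDFObjectModel listener; the protected constructor accepts a custom HDFLowLevelParsingListener instead.

        // Illustrative driver for HDFObjectFactory; the wrapper class is not part of POI.
        import java.io.FileInputStream;
        import org.apache.poi.hdf.model.HDFObjectFactory;

        public class HDFObjectFactoryDriver
        {
            public static void main(String[] args) throws Exception
            {
                FileInputStream istream = new FileInputStream(args[0]);
                // Parsing happens inside the constructor; with no listener supplied,
                // a default HDFObjectModel collects the low level objects.
                HDFObjectFactory factory = new HDFObjectFactory(istream);
                istream.close();
            }
        }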

Fields Summary
private POIFSFileSystem
_filesystem
OLE stuff
private FileInformationBlock
_fib
The FIB
private HDFLowLevelParsingListener
_listener
Used to set up the object model
private ParsingState
_charParsingState
parsing state for characters
private ParsingState
_parParsingState
parsing state for paragraphs
byte[]
_mainDocument
main document stream buffer
byte[]
_tableBuffer
table stream buffer
Constructors Summary
protected HDFObjectFactory(InputStream istream, HDFLowLevelParsingListener l)
Creates a new instance of HDFObjectFactory

param
istream The InputStream that is the Word document
param
l The HDFLowLevelParsingListener that receives the low level objects; if null, a default HDFObjectModel is used

        if (l == null)
        {
            _listener = new HDFObjectModel();
        }
        else
        {
            _listener = l;
        }

        //do Ole stuff
        _filesystem = new POIFSFileSystem(istream);

        DocumentEntry headerProps =
            (DocumentEntry)_filesystem.getRoot().getEntry("WordDocument");

        _mainDocument = new byte[headerProps.getSize()];
        _filesystem.createDocumentInputStream("WordDocument").read(_mainDocument);

        _fib = new FileInformationBlock(_mainDocument);

        initTableStream();
        initTextPieces();
        initFormattingProperties();


    
public HDFObjectFactory(InputStream istream)
Creates a new instance of HDFObjectFactory

param
istream The InputStream that is the Word document

        this(istream, null);
    
Methods Summary
private void createFontTable()
Initializes this document's FontTable.

        int fontTableIndex = _fib.getFcSttbfffn();
        int fontTableSize = _fib.getLcbSttbfffn();
        byte[] fontTable = new byte[fontTableSize];
        System.arraycopy(_tableBuffer, fontTableIndex, fontTable, 0, fontTableSize);
        _listener.fonts(new FontTable(fontTable));
    
private void createListTables()
Initializes the list tables for this document

        int lfoOffset = _fib.getFcPlfLfo();
        int lfoSize = _fib.getLcbPlfLfo();
        byte[] plflfo = new byte[lfoSize];

        System.arraycopy(_tableBuffer, lfoOffset, plflfo, 0, lfoSize);

        int lstOffset = _fib.getFcPlcfLst();
        int lstSize = _fib.getLcbPlcfLst();
        if (lstOffset > 0 && lstSize > 0)
        {
          //  The lstSize returned by _fib.getLcbPlcfLst() doesn't appear
          //  to take into account any LVLs.  Therefore, we recalculate
          //  lstSize based on where the LFO section begins (because the
          //  LFO section immediately follows the LST section).
          lstSize = lfoOffset - lstOffset;
          byte[] plcflst = new byte[lstSize];
          System.arraycopy(_tableBuffer, lstOffset, plcflst, 0, lstSize);
          _listener.lists(new ListTables(plcflst, plflfo));
        }
    
private void createStyleSheet()
Uncompresses the StyleSheet from file into memory.

      int stshIndex = _fib.getFcStshf();
      int stshSize = _fib.getLcbStshf();
      byte[] stsh = new byte[stshSize];
      System.arraycopy(_tableBuffer, stshIndex, stsh, 0, stshSize);

      _listener.styleSheet(new StyleSheet(stsh));
    
public static java.util.List getTypes(java.io.InputStream istream)
Reads the given Word document stream and returns a list of its low level types; at present the list contains only the document's FileInformationBlock.

        List results = new ArrayList(1);

        //do Ole stuff
        POIFSFileSystem filesystem = new POIFSFileSystem(istream);

        DocumentEntry headerProps =
            (DocumentEntry)filesystem.getRoot().getEntry("WordDocument");

        byte[] mainDocument = new byte[headerProps.getSize()];
        filesystem.createDocumentInputStream("WordDocument").read(mainDocument);

        FileInformationBlock fib = new FileInformationBlock(mainDocument);


        results.add(fib);
        return results;
    
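A short, hedged example of calling getTypes from application code (the wrapper class and command line argument are placeholders); the returned list currently carries a single FileInformationBlock.

        // Illustrative use of getTypes; prints the FIB parsed from a Word file.
        import java.io.FileInputStream;
        import java.util.List;
        import org.apache.poi.hdf.model.HDFObjectFactory;

        public class FibPeek
        {
            public static void main(String[] args) throws Exception
            {
                List types = HDFObjectFactory.getTypes(new FileInputStream(args[0]));
                // The list currently holds one element: the FileInformationBlock.
                System.out.println(types.get(0));
            }
        }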
private void initCharacterProperties(int charOffset, org.apache.poi.hdf.model.hdftypes.PlexOfCps charPlcf, int start, int end)

        //Initialize paragraph property stuff
        //int currentCharPage = _charParsingState.getCurrentPage();
        int charPlcfLen = charPlcf.length();
        int currentPageIndex = _charParsingState.getCurrentPageIndex();
        FormattedDiskPage fkp = _charParsingState.getFkp();
        int currentChpxIndex = _charParsingState.getCurrentPropIndex();
        int currentArraySize = fkp.size();

        //get the character runs for this paragraph
        int charStart = 0;
        int charEnd = 0;
        //add the character runs
        do
        {
          if (currentChpxIndex < currentArraySize)
          {
            charStart = fkp.getStart(currentChpxIndex);
            charEnd = fkp.getEnd(currentChpxIndex);
            byte[] chpx = fkp.getGrpprl(currentChpxIndex);
            _listener.characterRun(new ChpxNode(Math.max(charStart, start),  Math.min(charEnd, end), chpx));

            if (charEnd < end)
            {
              currentChpxIndex++;
            }
            else
            {
              _charParsingState.setState(currentPageIndex, fkp, currentChpxIndex);
              break;
            }
          }
          else
          {
            int currentCharPage = LittleEndian.getInt(_tableBuffer, charOffset + charPlcf.getStructOffset(++currentPageIndex));
            byte[] byteFkp = new byte[512];
            System.arraycopy(_mainDocument, (currentCharPage * 512), byteFkp, 0, 512);
            fkp = new CHPFormattedDiskPage(byteFkp);
            currentChpxIndex = 0;
            currentArraySize = fkp.size();
          }
        }
        while(currentPageIndex < charPlcfLen);
    
private void initDocumentProperties()
Initializes the DocumentProperties object unique to this document.

        int pos = _fib.getFcDop();
        int size = _fib.getLcbDop();
        byte[] dopArray = new byte[size];

        System.arraycopy(_tableBuffer, pos, dopArray, 0, size);
        _listener.document(new DocumentProperties(dopArray));
    
private void initFormattingProperties()
Initializes all of the formatting properties for a Word document.

        createStyleSheet();
        createListTables();
        createFontTable();

        initDocumentProperties();
        initSectionProperties();
        //initCharacterProperties();
        //initParagraphProperties();
    
private void initParagraphProperties()
Initializes the Paragraph Properties BTree.

        //paragraphs
        int parOffset = _fib.getFcPlcfbtePapx();
        int parPlcSize = _fib.getLcbPlcfbtePapx();

        //characters
        int charOffset = _fib.getFcPlcfbteChpx();
        int charPlcSize = _fib.getLcbPlcfbteChpx();

        PlexOfCps charPlcf = new PlexOfCps(charPlcSize, 4);
        PlexOfCps parPlcf = new PlexOfCps(parPlcSize, 4);

        //Initialize character property stuff
        int currentCharPage = LittleEndian.getInt(_tableBuffer, charOffset + charPlcf.getStructOffset(0));
        int charPlcfLen = charPlcf.length();
        int currentPageIndex = 0;
        byte[] fkp = new byte[512];
        System.arraycopy(_mainDocument, (currentCharPage * 512), fkp, 0, 512);
        CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(fkp);
        int currentChpxIndex = 0;
        int currentArraySize = cfkp.size();


        int arraySize = parPlcf.length();

        //first we must go through the bin table and find the fkps
        for(int x = 0; x < arraySize; x++)
        {
            int PN = LittleEndian.getInt(_tableBuffer, parOffset + parPlcf.getStructOffset(x));

            fkp = new byte[512];
            System.arraycopy(_mainDocument, (PN * 512), fkp, 0, 512);

            PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(fkp);
            //take each fkp and get the paps
            int crun = pfkp.size();
            for(int y = 0; y < crun; y++)
            {
                //get the beginning fc of each paragraph text run
                int fcStart = pfkp.getStart(y);
                int fcEnd = pfkp.getEnd(y);

                //get the papx for this paragraph
                byte[] papx = pfkp.getGrpprl(y);

                _listener.paragraph(new PapxNode(fcStart, fcEnd, papx));

                //get the character runs for this paragraph
                int charStart = 0;
                int charEnd = 0;
                //add the character runs
                do
                {
                  if (currentChpxIndex < currentArraySize)
                  {
                    charStart = cfkp.getStart(currentChpxIndex);
                    charEnd = cfkp.getEnd(currentChpxIndex);
                    byte[] chpx = cfkp.getGrpprl(currentChpxIndex);
                    _listener.characterRun(new ChpxNode(charStart, charEnd, chpx));
                    if (charEnd < fcEnd)
                    {
                      currentChpxIndex++;
                    }
                    else
                    {
                      break;
                    }
                  }
                  else
                  {
                    currentCharPage = LittleEndian.getInt(_tableBuffer, charOffset + charPlcf.getStructOffset(++currentPageIndex));
                    fkp = new byte[512];
                    System.arraycopy(_mainDocument, (currentCharPage * 512), fkp, 0, 512);
                    cfkp = new CHPFormattedDiskPage(fkp);
                    currentChpxIndex = 0;
                    currentArraySize = cfkp.size();
                  }
                }
                while(currentCharPage <= charPlcfLen + 1);

            }

        }

    
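The 512-byte arrays copied above are FKPs (formatted disk pages): each FKP occupies exactly one 512-byte page of the WordDocument stream, keeps its run count (crun) in the last byte of the page, and starts with crun + 1 four-byte FCs that delimit the runs. A hedged sketch of reading those two pieces of a page (illustrative helper under that assumption, not the POI CHPFormattedDiskPage/PAPFormattedDiskPage classes):

        // Illustrative FKP reader: run count and run boundaries of one 512-byte page.
        final class FkpSketch
        {
            static int runCount(byte[] page512)
            {
                // crun is stored in the last byte of the page
                return page512[511] & 0xff;
            }
            static int runStartFc(byte[] page512, int i)
            {
                // the page begins with crun + 1 little-endian FCs delimiting the runs
                return org.apache.poi.util.LittleEndian.getInt(page512, i * 4);
            }
        }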
private void initParagraphProperties(int parOffset, org.apache.poi.hdf.model.hdftypes.PlexOfCps parPlcf, int charOffset, org.apache.poi.hdf.model.hdftypes.PlexOfCps charPlcf, int start, int end)

        //Initialize paragraph property stuff
        //int currentParPage = _parParsingState.getCurrentPage();
        int parPlcfLen = parPlcf.length();
        int currentPageIndex = _parParsingState.getCurrentPageIndex();
        FormattedDiskPage fkp = _parParsingState.getFkp();
        int currentPapxIndex = _parParsingState.getCurrentPropIndex();
        int currentArraySize = fkp.size();

        do
        {
          if (currentPapxIndex < currentArraySize)
          {
            int parStart = fkp.getStart(currentPapxIndex);
            int parEnd = fkp.getEnd(currentPapxIndex);
            byte[] papx = fkp.getGrpprl(currentPapxIndex);
            _listener.paragraph(new PapxNode(Math.max(parStart, start), Math.min(parEnd, end), papx));
            initCharacterProperties(charOffset, charPlcf, Math.max(start, parStart), Math.min(parEnd, end));
            if (parEnd < end)
            {
              currentPapxIndex++;
            }
            else
            {
              //save the state
              _parParsingState.setState(currentPageIndex, fkp, currentPapxIndex);
              break;
            }
          }
          else
          {
            int currentParPage = LittleEndian.getInt(_tableBuffer, parOffset + parPlcf.getStructOffset(++currentPageIndex));
            byte byteFkp[] = new byte[512];
            System.arraycopy(_mainDocument, (currentParPage * 512), byteFkp, 0, 512);
            fkp = new PAPFormattedDiskPage(byteFkp);
            currentPapxIndex = 0;
            currentArraySize = fkp.size();
          }
        }
        while(currentPageIndex < parPlcfLen);
    
private void initParsingStates(int parOffset, org.apache.poi.hdf.model.hdftypes.PlexOfCps parPlcf, int charOffset, org.apache.poi.hdf.model.hdftypes.PlexOfCps charPlcf)

        int currentCharPage = LittleEndian.getInt(_tableBuffer, charOffset + charPlcf.getStructOffset(0));
        byte[] fkp = new byte[512];
        System.arraycopy(_mainDocument, (currentCharPage * 512), fkp, 0, 512);
        CHPFormattedDiskPage cfkp = new CHPFormattedDiskPage(fkp);
        _charParsingState = new ParsingState(currentCharPage, cfkp);

        int currentParPage = LittleEndian.getInt(_tableBuffer, parOffset + parPlcf.getStructOffset(0));
        fkp = new byte[512];
        System.arraycopy(_mainDocument, (currentParPage * 512), fkp, 0, 512);
        PAPFormattedDiskPage pfkp = new PAPFormattedDiskPage(fkp);
        _parParsingState = new ParsingState(currentParPage, pfkp);
    
private void initSectionProperties()
Initializes the SectionProperties BTree.


      int ccpText = _fib.getCcpText();
      int ccpFtn = _fib.getCcpFtn();

      //sections
      int fcMin = _fib.getFcMin();
      int plcfsedFC = _fib.getFcPlcfsed();
      int plcfsedSize = _fib.getLcbPlcfsed();

      //paragraphs
      int parOffset = _fib.getFcPlcfbtePapx();
      int parPlcSize = _fib.getLcbPlcfbtePapx();

      //characters
      int charOffset = _fib.getFcPlcfbteChpx();
      int charPlcSize = _fib.getLcbPlcfbteChpx();

      PlexOfCps charPlcf = new PlexOfCps(charPlcSize, 4);
      PlexOfCps parPlcf = new PlexOfCps(parPlcSize, 4);

      initParsingStates(parOffset, parPlcf, charOffset, charPlcf);

      //byte[] plcfsed = new byte[plcfsedSize];
      //System.arraycopy(_tableBuffer, plcfsedFC, plcfsed, 0, plcfsedSize);

      PlexOfCps plcfsed = new PlexOfCps(plcfsedSize, 12);
      int arraySize = plcfsed.length();

      int start = fcMin;
      int end = fcMin + ccpText;
      int x = 0;
      int sectionEnd = 0;

      //do the main body sections
      while (x < arraySize)
      {
          int sectionStart = LittleEndian.getInt(_tableBuffer, plcfsedFC + plcfsed.getIntOffset(x)) + fcMin;
          sectionEnd = LittleEndian.getInt(_tableBuffer, plcfsedFC + plcfsed.getIntOffset(x + 1)) + fcMin;
          int sepxStart = LittleEndian.getInt(_tableBuffer, plcfsedFC + plcfsed.getStructOffset(x) + 2);
          int sepxSize = LittleEndian.getShort(_mainDocument, sepxStart);

          byte[] sepx = new byte[sepxSize];
          System.arraycopy(_mainDocument, sepxStart + 2, sepx, 0, sepxSize);
          SepxNode node = new SepxNode(x + 1, sectionStart, sectionEnd, sepx);
          _listener.bodySection(node);
          initParagraphProperties(parOffset, parPlcf, charOffset, charPlcf, sectionStart, Math.min(end, sectionEnd));

          if (sectionEnd > end)
          {
            break;
          }
          else
          {
            x++;
          }
      }
      //do the header sections
      for (; x < arraySize; x++)// && sectionEnd <= end; x++)
      {
          int sectionStart = LittleEndian.getInt(_tableBuffer, plcfsedFC + plcfsed.getIntOffset(x)) + fcMin;
          sectionEnd = LittleEndian.getInt(_tableBuffer, plcfsedFC + plcfsed.getIntOffset(x + 1)) + fcMin;
          int sepxStart = LittleEndian.getInt(_tableBuffer, plcfsedFC + plcfsed.getStructOffset(x) + 2);
          int sepxSize = LittleEndian.getShort(_mainDocument, sepxStart);

          byte[] sepx = new byte[sepxSize];
          System.arraycopy(_mainDocument, sepxStart + 2, sepx, 0, sepxSize);
          SepxNode node = new SepxNode(x + 1, sectionStart, sectionEnd, sepx);
          _listener.hdrSection(node);
          initParagraphProperties(parOffset, parPlcf, charOffset, charPlcf, Math.max(sectionStart, end), sectionEnd);

      }
      _listener.endSections();
    
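The plcfsed walk above relies on the general PLCF layout: a PLCF holding n structures of cb bytes each begins with n + 1 four-byte character positions followed by the n structures, so n = (lcb - 4) / (4 + cb). A hedged sketch of the offset arithmetic that PlexOfCps is assumed to provide (the helper class below is illustrative, not the POI class itself):

        // Illustrative PLCF offset arithmetic (assumed equivalent to PlexOfCps).
        final class PlcfMath
        {
            // number of structures in a PLCF of lcb bytes with cb-byte structures
            static int count(int lcb, int cb)
            {
                return (lcb - 4) / (4 + cb);
            }
            // offset of the i-th character position (4 bytes each)
            static int intOffset(int i)
            {
                return 4 * i;
            }
            // offset of the i-th structure, which follows the n + 1 character positions
            static int structOffset(int n, int cb, int i)
            {
                return 4 * (n + 1) + cb * i;
            }
        }

With cb = 12 for section descriptors this matches the plcfsedFC + getStructOffset(x) reads above, and with cb = 8 it reproduces the (pieceTableSize - 4) / 12 piece count used in initTextPieces().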
private void initTableStream()
Initializes the table stream.

throws
IOException

        String tablename = null;
        if(_fib.isFWhichTblStm())
        {
            tablename="1Table";
        }
        else
        {
          tablename="0Table";
        }

        DocumentEntry tableEntry = (DocumentEntry)_filesystem.getRoot().getEntry(tablename);

        //load the table stream into a buffer
        int size = tableEntry.getSize();
        _tableBuffer = new byte[size];
        _filesystem.createDocumentInputStream(tablename).read(_tableBuffer);
    
private void initTextPieces()
Initializes the text pieces. Text is divided into pieces because some pieces contain only Unicode characters while others hold compressed 8-bit text.

throws
IOException

        int pos = _fib.getFcClx();

        //skips through the prms before we reach the piece table. These contain data
        //for actual fast saved files
        while (_tableBuffer[pos] == 1)
        {
            pos++;
            int skip = LittleEndian.getShort(_tableBuffer, pos);
            pos += 2 + skip;
        }
        if(_tableBuffer[pos] != 2)
        {
            throw new IOException("The text piece table is corrupted");
        }
        else
        {
            //parse out the text pieces
            int pieceTableSize = LittleEndian.getInt(_tableBuffer, ++pos);
            pos += 4;
            int pieces = (pieceTableSize - 4) / 12;
            for (int x = 0; x < pieces; x++)
            {
                int filePos = LittleEndian.getInt(_tableBuffer, pos + ((pieces + 1) * 4) + (x * 8) + 2);
                boolean unicode = false;
                if ((filePos & 0x40000000) == 0)
                {
                    unicode = true;
                }
                else
                {
                    unicode = false;
                    filePos &= ~(0x40000000);//gives me FC in doc stream
                    filePos /= 2;
                }
                int totLength = LittleEndian.getInt(_tableBuffer, pos + (x + 1) * 4) -
                                LittleEndian.getInt(_tableBuffer, pos + (x * 4));

                TextPiece piece = new TextPiece(filePos, totLength, unicode);
                _listener.text(piece);

            }

        }

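The filePos handling above follows the piece descriptor convention: when bit 0x40000000 of the stored position is clear, the piece is 16-bit Unicode text at that byte offset; when it is set, the piece is compressed 8-bit text whose byte offset is the value with the flag masked off, divided by two. A small illustrative helper (names are made up, not part of the original class):

        // Sketch of the FC decoding performed in initTextPieces(); names are made up.
        final class PieceFcDecoder
        {
            static boolean isUnicode(int rawFc)
            {
                return (rawFc & 0x40000000) == 0;
            }
            // returns the real byte offset of the piece in the WordDocument stream
            static int fileOffset(int rawFc)
            {
                return isUnicode(rawFc) ? rawFc : ((rawFc & ~0x40000000) / 2);
            }
        }
        // e.g. fileOffset(0x40000800) == 0x400 (compressed 8-bit text)
        //      fileOffset(0x00000800) == 0x800 (Unicode text)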
    
public static void main(java.lang.String[] args)

      try
      {
        HDFObjectFactory f = new HDFObjectFactory(new FileInputStream("c:\\test.doc"));
        int k = 0;
      }
      catch(Throwable t)
      {
        t.printStackTrace();
      }