File | Doc Category | Size | Date | Package
VisioTextExtractor.java | API Doc | Apache Poi 3.0.1 | 3790 | Wed Jun 27 19:48:02 BST 2007 | org.apache.poi.hdgf.extractor

VisioTextExtractor

public class VisioTextExtractor extends Object
Class to find all the text in a Visio file, and return it. Can operate on the command line (outputs to stdout), or can return the text for you (e.g. for use with Lucene).

Fields Summary
private HDGFDiagram
hdgf
private POIFSFileSystem
fs
Constructors Summary
/**
 * Creates an extractor around an existing diagram object.
 * Only the hdgf field is set here; the fs field is not assigned
 * by this constructor.
 *
 * @param hdgf the diagram whose top level streams getAllText() walks
 */
public VisioTextExtractor(HDGFDiagram hdgf)

		// Keep hold of the diagram; all the text lookups go through it
		this.hdgf = hdgf;
	
/**
 * Creates an extractor from a POIFS filesystem, building a new
 * HDGFDiagram from it and remembering the filesystem in the fs field.
 *
 * @param fs the filesystem handed to the HDGFDiagram constructor
 */
public VisioTextExtractor(POIFSFileSystem fs)

		// Delegate to the HDGFDiagram constructor; the this(...) call has to
		//  come first, so the filesystem is only recorded afterwards
		this(new HDGFDiagram(fs));
		this.fs = fs;
	
/**
 * Creates an extractor from a raw input stream, by wrapping it in a
 * new POIFSFileSystem and delegating to the filesystem constructor.
 *
 * @param inp the stream passed to the POIFSFileSystem constructor
 */
public VisioTextExtractor(InputStream inp)

		// Wrap the stream in a filesystem, and let that constructor do the rest
		this(new POIFSFileSystem(inp));
	
Methods Summary
/**
 * Collects the text held by the given stream, and by every stream
 *  reachable from it, into the supplied list.
 * Streams which point to other streams are descended into recursively.
 * For chunk streams, each chunk named "Text" contributes the string
 *  form of the value of its first command. Chunks that are null, have
 *  a different (or no) name, have no commands at all, or whose first
 *  command or its value is null, are skipped.
 *
 * @param stream the stream to search
 * @param text the list the found strings are appended to
 */
private voidfindText(org.apache.poi.hdgf.streams.Stream stream, java.util.ArrayList text)

		if(stream instanceof PointerContainingStream) {
			// Recurse into every stream this one points at
			org.apache.poi.hdgf.streams.Stream[] children =
				((PointerContainingStream)stream).getPointedToStreams();
			for(int i=0; i<children.length; i++) {
				findText(children[i], text);
			}
		}
		if(stream instanceof ChunkStream) {
			ChunkStream cs = (ChunkStream)stream;
			for(int i=0; i<cs.getChunks().length; i++) {
				// Only chunks called "Text" are of interest
				if(cs.getChunks()[i] == null ||
						!"Text".equals(cs.getChunks()[i].getName())) {
					continue;
				}

				// A text chunk without any commands has nothing to offer,
				//  and mustn't be indexed into
				Command[] commands = cs.getChunks()[i].getCommands();
				if(commands == null || commands.length == 0) {
					continue;
				}

				// First command
				Command cmd = commands[0];
				if(cmd != null && cmd.getValue() != null) {
					text.add( cmd.getValue().toString() );
				}
			}
		}
	
public java.lang.String[]getAllText()
Locates all the text entries in the file, and returns their contents.

		// Everything findText comes across ends up in here
		ArrayList found = new ArrayList();

		// Search each of the diagram's top level streams in turn
		int idx = 0;
		while(idx < hdgf.getTopLevelStreams().length) {
			findText(hdgf.getTopLevelStreams()[idx], found);
			idx++;
		}

		// Hand the entries back as an exactly sized string array
		String[] result = new String[found.size()];
		found.toArray(result);
		return result;
	
public java.lang.StringgetText()
Returns the textual contents of the file.

		String[] entries = getAllText();
		StringBuffer joined = new StringBuffer();
		for(int pos=0; pos<entries.length; pos++) {
			String entry = entries[pos];
			joined.append(entry);

			// Make sure every entry ends up on a line of its own
			boolean terminated = entry.endsWith("\r") || entry.endsWith("\n");
			if(!terminated) {
				joined.append("\n");
			}
		}
		return joined.toString();
	
/**
 * Command line entry point. Extracts the text of the Visio file named
 *  by the first argument, and prints it to stdout. With no arguments,
 *  prints a usage message to stderr and exits with status 1.
 *
 * @param args the command line arguments; args[0] is the file to read
 */
public static voidmain(java.lang.String[] args)

		if(args.length == 0) {
			System.err.println("Use:");
			System.err.println("   VisioTextExtractor <file.vsd>");
			System.exit(1);
		}

		// Hold on to the stream ourselves, so that it always gets closed
		//  again, even if the file turns out not to be readable as Visio
		FileInputStream inp = new FileInputStream(args[0]);
		try {
			VisioTextExtractor extractor = new VisioTextExtractor(inp);

			// Print not PrintLn as already has \n added to it
			System.out.print(extractor.getText());
		} finally {
			inp.close();
		}