File | Doc | Category | Size | Date | Package
WordExtractor.java | API Doc | Apache Poi 3.0.1 | 4780 | Sun Mar 11 12:59:30 GMT 2007 | org.apache.poi.hwpf.extractor

WordExtractor

public class WordExtractor extends Object
Class to extract the text from a Word Document. You should use either getParagraphText() or getText() unless you have a strong reason otherwise.
author
Nick Burch (nick at torchbox dot com)

Fields Summary
private POIFSFileSystem
fs
private HWPFDocument
doc
Constructors Summary
public WordExtractor(InputStream is)
Create a new Word Extractor

param
is InputStream containing the word file

		// Hand the stream to HWPFDocument.verifyAndBuildPOIFS and delegate to
		//  another constructor with the result (presumably a POIFSFileSystem,
		//  going by the helper's name - confirm against HWPFDocument)
		this( HWPFDocument.verifyAndBuildPOIFS(is) );
	
public WordExtractor(POIFSFileSystem fs)
Create a new Word Extractor

param
fs POIFSFileSystem containing the word file

		// Build the HWPFDocument from the filesystem and delegate to the
		//  HWPFDocument based constructor, which stores the document
		this(new HWPFDocument(fs));
		// Also keep hold of the filesystem the document was built from
		this.fs = fs;
	
public WordExtractor(HWPFDocument doc)
Create a new Word Extractor

param
doc The HWPFDocument to extract from

		// Every extraction method reads the text from this document
		this.doc = doc;
	
Methods Summary
public java.lang.String[] getParagraphText()
Get the text from the word file, as an array with one String per paragraph

		// Preferred route: ask the document model for its paragraphs
		try {
			Range range = doc.getRange();
			String[] paragraphs = new String[range.numParagraphs()];

			int idx = 0;
			while(idx < paragraphs.length) {
				String content = range.getParagraph(idx).text();
				// A paragraph that ends in \r gets a \n added after it
				paragraphs[idx] = content.endsWith("\r") ? content + "\n" : content;
				idx++;
			}
			return paragraphs;
		} catch(Exception e) {
			// Turning the text pieces into paragraphs went wrong somewhere,
			//  so hand back the raw text piece contents as a single entry
			return new String[] { getTextFromPieces() };
		}
	
public java.lang.String getText()
Grab the text, based on the paragraphs. Shouldn't include any crud, but slightly slower than getTextFromPieces().

		// Join the per-paragraph strings, in order, into one block of text
		String[] paragraphs = getParagraphText();
		StringBuffer joined = new StringBuffer();
		int idx = 0;
		while(idx < paragraphs.length) {
			joined.append(paragraphs[idx]);
			idx++;
		}
		return joined.toString();
	
public java.lang.String getTextFromPieces()
Grab the text out of the text pieces. Might also include various bits of crud, but will work in cases where the text piece -> paragraph mapping is broken. Fast too.

    	StringBuffer textBuf = new StringBuffer();

    	// Decode every text piece, in the order the text table returns them
    	Iterator textPieces = doc.getTextTable().getTextPieces().iterator();
    	while (textPieces.hasNext()) {
    		TextPiece piece = (TextPiece) textPieces.next();

    		// Pieces flagged as unicode are decoded as UTF-16LE, all others as Cp1252
    		String encoding = "Cp1252";
    		if (piece.usesUnicode()) {
    			encoding = "UTF-16LE";
    		}
    		try {
    			String text = new String(piece.getRawBytes(), encoding);
    			textBuf.append(text);
    		} catch(UnsupportedEncodingException e) {
    			// Keep the underlying exception attached, so the real failure
    			//  is still visible in the stack trace
    			InternalError broken = new InternalError("Standard Encoding " + encoding + " not found, JVM broken");
    			broken.initCause(e);
    			throw broken;
    		}
    	}

    	String text = textBuf.toString();

    	// Fix line endings (Note - won't get all of them, as a lone \r in
    	//  the middle of the text is left untouched)
    	text = text.replaceAll("\r\r\r", "\r\n\r\n\r\n");
    	text = text.replaceAll("\r\r", "\r\n\r\n");

    	// A \r right at the end always gets its \n
    	if(text.endsWith("\r")) {
    		text += "\n";
    	}

    	return text;
	
public static void main(java.lang.String[] args)
Command line extractor, so people will stop moaning that they can't just run this.

		// Without a filename there is nothing to do, so print the usage and bail out
		if(args.length == 0) {
			System.err.println("Use:");
			System.err.println("   java org.apache.poi.hwpf.extractor.WordExtractor <filename>");
			System.exit(1);
		}

		// Process the first argument as a file
		FileInputStream fin = new FileInputStream(args[0]);
		try {
			WordExtractor extractor = new WordExtractor(fin);
			System.out.println(extractor.getText());
		} finally {
			// Release the file handle whether or not the extraction worked
			fin.close();
		}