Methods Summary |
---|
public java.lang.String[] | getParagraphText()Get the text from the Word file, as an array with one String
per paragraph.
// Preferred path: walk the document's range one paragraph at a time
try {
    Range range = doc.getRange();
    int count = range.numParagraphs();
    String[] paragraphs = new String[count];
    for (int idx = 0; idx < count; idx++) {
        String content = range.getParagraph(idx).text();
        // A paragraph that ends in a bare CR gets an LF appended after it
        paragraphs[idx] = content.endsWith("\r") ? content + "\n" : content;
    }
    return paragraphs;
} catch (Exception e) {
    // Building paragraphs from the text pieces failed for some reason;
    // fall back to the raw text pieces, returned as a single entry
    return new String[] { getTextFromPieces() };
}
|
public java.lang.String | getText()Grab the text, based on the paragraphs. Shouldn't include any crud,
but slightly slower than getTextFromPieces().
// Concatenate every paragraph, in order, into one string
String[] paragraphs = getParagraphText();
StringBuffer joined = new StringBuffer();
int idx = 0;
while (idx < paragraphs.length) {
    joined.append(paragraphs[idx]);
    idx++;
}
return joined.toString();
|
public java.lang.String | getTextFromPieces()Grab the text out of the text pieces. Might also include various
bits of crud, but will work in cases where the text piece -> paragraph
mapping is broken. Fast too.
StringBuffer textBuf = new StringBuffer();
Iterator textPieces = doc.getTextTable().getTextPieces().iterator();
while (textPieces.hasNext()) {
    TextPiece piece = (TextPiece) textPieces.next();
    // Pieces reporting usesUnicode() are decoded as UTF-16LE, all others as Cp1252
    String encoding = piece.usesUnicode() ? "UTF-16LE" : "Cp1252";
    try {
        textBuf.append(new String(piece.getRawBytes(), encoding));
    } catch (UnsupportedEncodingException e) {
        // Keep the original exception attached as the cause so the failure
        // can still be diagnosed from the stack trace
        InternalError broken = new InternalError("Standard Encoding " + encoding + " not found, JVM broken");
        broken.initCause(e);
        throw broken;
    }
}
String text = textBuf.toString();
// Fix line endings. Note: this won't get all of them - only runs of two or
// more CRs are expanded, plus a CR at the very end of the text.
text = text.replaceAll("\r\r\r", "\r\n\r\n\r\n");
text = text.replaceAll("\r\r", "\r\n\r\n");
if (text.endsWith("\r")) {
    text += "\n";
}
return text;
|
public static void | main(java.lang.String[] args)Command line extractor, so people will stop moaning that
they can't just run this.
// Without a filename there is nothing to extract: print usage and exit non-zero
if (args.length == 0) {
    System.err.println("Use:");
    System.err.println(" java org.apache.poi.hwpf.extractor.WordExtractor <filename>");
    System.exit(1);
}
// Process the first argument as a file
FileInputStream fin = new FileInputStream(args[0]);
try {
    WordExtractor extractor = new WordExtractor(fin);
    System.out.println(extractor.getText());
} finally {
    // Always release the file handle, even if extraction fails part-way
    fin.close();
}
|