PowerPointExtractor — public class PowerPointExtractor extends Object. This class can be used to extract text from a PowerPoint file.
It can optionally also get the notes from one. |
Fields Summary |
---|
private HSLFSlideShow | _hslfshow | private SlideShow | _show | private Slide[] | _slides | private Notes[] | _notes |
Constructors Summary |
---|
public PowerPointExtractor(String fileName)Creates a PowerPointExtractor, from a file
// Construct the HSLFSlideShow from the file name supplied by the caller
_hslfshow = new HSLFSlideShow(fileName);
// Build the SlideShow on top of the HSLFSlideShow just created
_show = new SlideShow(_hslfshow);
// Keep the slide and notes arrays obtained from the SlideShow in fields
_slides = _show.getSlides();
_notes = _show.getNotes();
| public PowerPointExtractor(InputStream iStream)Creates a PowerPointExtractor, from an Input Stream
// Construct the HSLFSlideShow from the input stream supplied by the caller
_hslfshow = new HSLFSlideShow(iStream);
// Build the SlideShow on top of the HSLFSlideShow just created
_show = new SlideShow(_hslfshow);
// Keep the slide and notes arrays obtained from the SlideShow in fields
_slides = _show.getSlides();
_notes = _show.getNotes();
| public PowerPointExtractor(POIFSFileSystem fs)Creates a PowerPointExtractor, from an open POIFSFileSystem
// Construct the HSLFSlideShow from the POIFSFileSystem supplied by the caller
_hslfshow = new HSLFSlideShow(fs);
// Build the SlideShow on top of the HSLFSlideShow just created
_show = new SlideShow(_hslfshow);
// Keep the slide and notes arrays obtained from the SlideShow in fields
_slides = _show.getSlides();
_notes = _show.getNotes();
| public PowerPointExtractor(HSLFSlideShow ss)Creates a PowerPointExtractor, from a HSLFSlideShow
// Use the caller's HSLFSlideShow as-is; note that close() on this
// extractor calls close() on this same object
_hslfshow = ss;
// Build the SlideShow on top of the supplied HSLFSlideShow
_show = new SlideShow(_hslfshow);
// Keep the slide and notes arrays obtained from the SlideShow in fields
_slides = _show.getSlides();
_notes = _show.getNotes();
|
Methods Summary |
---|
public void | close()Shuts down the underlying streams
// The fields are cleared below, so a repeated call would otherwise
// dereference null; treat closing an already-closed extractor as a no-op.
if(_hslfshow == null) {
    return;
}
try {
    _hslfshow.close();
} finally {
    // Release every reference even when the underlying close() fails,
    // so the extractor never keeps pointing at a half-closed slideshow.
    _hslfshow = null;
    _show = null;
    _slides = null;
    _notes = null;
}
| public java.lang.String | getNotes()Fetches all the notes text from the slideshow, but not the slide text
// Delegate to the two-flag overload: slide text off, note text on
return getText(false,true);
| public java.lang.String | getText()Fetches all the slide text from the slideshow, but not the notes
// Delegate to the two-flag overload: slide text on, note text off
return getText(true,false);
| public java.lang.String | getText(boolean getSlideText, boolean getNoteText)Fetches text from the slideshow, be it slide text or note text.
Because the final block of text in a TextRun normally has its
last \n stripped, we add it back
// Collects slide text and/or note text, in that order, into one string.
// getSlideText selects the text runs of every slide; getNoteText selects
// the text runs of each slide's notes sheet. Every run's text is followed
// by a "\n" if it does not already end with one.
StringBuffer ret = new StringBuffer();

if(getSlideText) {
    for(int i=0; i<_slides.length; i++) {
        TextRun[] runs = _slides[i].getTextRuns();
        // Same guard the notes pass applies: skip a sheet with no run array
        if(runs == null) { continue; }
        for(int j=0; j<runs.length; j++) {
            TextRun run = runs[j];
            if(run == null) { continue; }
            String text = run.getText();
            ret.append(text);
            // Put back the trailing newline if this run's text lacks one
            if(! text.endsWith("\n")) {
                ret.append("\n");
            }
        }
    }
    // Separator between the slide text and the note text that follows
    if(getNoteText) {
        ret.append(" ");
    }
}

if(getNoteText) {
    // Not currently using _notes, as that can have the notes of
    // master sheets in. Grab Slide list, then work from there,
    // but ensure no duplicates
    HashSet seenNotes = new HashSet();
    for(int i=0; i<_slides.length; i++) {
        Notes notes = _slides[i].getNotesSheet();
        if(notes == null) { continue; }

        // add() returns false when the sheet number was already recorded,
        // i.e. this notes sheet has been emitted for an earlier slide
        Integer id = new Integer(notes._getSheetNumber());
        if(! seenNotes.add(id)) { continue; }

        TextRun[] runs = notes.getTextRuns();
        if(runs == null) { continue; }
        for(int j=0; j<runs.length; j++) {
            TextRun run = runs[j];
            // Same guard the slide pass applies: skip null entries
            if(run == null) { continue; }
            String text = run.getText();
            ret.append(text);
            // Put back the trailing newline if this run's text lacks one
            if(! text.endsWith("\n")) {
                ret.append("\n");
            }
        }
    }
}

return ret.toString();
| public static void | main(java.lang.String[] args)Basic extractor. Returns all the text, and optionally all the notes
// Command-line entry point: prints the slide text of the given file to
// standard output, plus the notes text when a leading argument is present.
if(args.length < 1) {
    System.err.println("Useage:");
    System.err.println("\tPowerPointExtractor [-notes] <file>");
    System.exit(1);
}

// NOTE(review): any leading argument turns notes on; its value is never
// compared against "-notes". Kept as-is so existing invocations still work.
boolean notes = args.length > 1;
String file = notes ? args[1] : args[0];

PowerPointExtractor ppe = new PowerPointExtractor(file);
try {
    System.out.println(ppe.getText(true,notes));
} finally {
    // Close the extractor even if text extraction or printing fails
    ppe.close();
}
|
|