FileDocCategorySizeDatePackage
PowerPointExtractor.javaAPI DocApache Poi 3.0.15405Mon Jan 01 18:55:34 GMT 2007org.apache.poi.hslf.extractor

PowerPointExtractor

public class PowerPointExtractor extends Object
This class can be used to extract text from a PowerPoint file. Can optionally also get the notes from one.
author
Nick Burch

Fields Summary
private HSLFSlideShow
_hslfshow
private SlideShow
_show
private Slide[]
_slides
private Notes[]
_notes
Constructors Summary
public PowerPointExtractor(String fileName)
Creates a PowerPointExtractor, from a file

param
fileName The name of the file to extract from

	// Open the low level document from the named file, wrap it in a
	//  SlideShow, then keep hold of that show's slides and notes
	this._hslfshow = new HSLFSlideShow(fileName);
	this._show = new SlideShow(this._hslfshow);
	this._slides = this._show.getSlides();
	this._notes = this._show.getNotes();
  
public PowerPointExtractor(InputStream iStream)
Creates a PowerPointExtractor, from an Input Stream

param
iStream The input stream containing the PowerPoint document

	// Read the low level document from the supplied stream, wrap it in a
	//  SlideShow, then keep hold of that show's slides and notes
	this._hslfshow = new HSLFSlideShow(iStream);
	this._show = new SlideShow(this._hslfshow);
	this._slides = this._show.getSlides();
	this._notes = this._show.getNotes();
  
public PowerPointExtractor(POIFSFileSystem fs)
Creates a PowerPointExtractor, from an open POIFSFileSystem

param
fs the POIFSFileSystem containing the PowerPoint document

	// Build the low level document on top of the given filesystem, wrap it
	//  in a SlideShow, then keep hold of that show's slides and notes
	this._hslfshow = new HSLFSlideShow(fs);
	this._show = new SlideShow(this._hslfshow);
	this._slides = this._show.getSlides();
	this._notes = this._show.getNotes();
  
public PowerPointExtractor(HSLFSlideShow ss)
Creates a PowerPointExtractor, from a HSLFSlideShow

param
ss the HSLFSlideShow to extract text from

	// Use the caller's HSLFSlideShow directly, wrap it in a SlideShow,
	//  then keep hold of that show's slides and notes
	this._hslfshow = ss;
	this._show = new SlideShow(this._hslfshow);
	this._slides = this._show.getSlides();
	this._notes = this._show.getNotes();
  
Methods Summary
public void close()
Shuts down the underlying streams

	// Close the underlying document first; the references below are
	//  only dropped if that call returns normally
	this._hslfshow.close();
	this._notes = null;
	this._slides = null;
	this._show = null;
	this._hslfshow = null;
  
public java.lang.String getNotes()
Fetches all the notes text from the slideshow, but not the slide text

	return getText(false,true);
  
public java.lang.String getText()
Fetches all the slide text from the slideshow, but not the notes

	return getText(true,false);
  
public java.lang.String getText(boolean getSlideText, boolean getNoteText)
Fetches text from the slideshow, be it slide text or note text. Because the final block of text in a TextRun normally has its last \n stripped, we add it back

param
getSlideText fetch slide text
param
getNoteText fetch note text

	// Accumulates the extracted text, one TextRun after another
	StringBuffer ret = new StringBuffer();

	if(getSlideText) {
		for(int i=0; i<_slides.length; i++) {
			TextRun[] runs = _slides[i].getTextRuns();
			// Same guard the notes loop below applies: skip a sheet
			//  that hands back no run array
			if(runs == null) { continue; }
			for(int j=0; j<runs.length; j++) {
				TextRun run = runs[j];
				if(run == null) { continue; }
				String text = run.getText();
				ret.append(text);
				// Make sure every run ends up newline terminated
				if(! text.endsWith("\n")) {
					ret.append("\n");
				}
			}
		}
		// Single space between the slide text and the note text
		//  when both were asked for
		if(getNoteText) {
			ret.append(" ");
		}
	}

	if(getNoteText) {
		// Not currently using _notes, as that can have the notes of
		//  master sheets in. Grab Slide list, then work from there,
		//  but ensure no duplicates
		HashSet seenNotes = new HashSet();
		for(int i=0; i<_slides.length; i++) {
			Notes notes = _slides[i].getNotesSheet();
			if(notes == null) { continue; }
			// add() reports false when this sheet number was already
			//  recorded, ie the notes sheet has been output before
			Integer id = new Integer(notes._getSheetNumber());
			if(! seenNotes.add(id)) { continue; }

			TextRun[] runs = notes.getTextRuns();
			if(runs == null) { continue; }
			for(int j=0; j<runs.length; j++) {
				TextRun run = runs[j];
				// Same guard the slide loop above applies, so a null
				//  entry is skipped rather than dereferenced
				if(run == null) { continue; }
				String text = run.getText();
				ret.append(text);
				// Make sure every run ends up newline terminated
				if(! text.endsWith("\n")) {
					ret.append("\n");
				}
			}
		}
	}

	return ret.toString();
  
public static void main(java.lang.String[] args)
Basic extractor. Returns all the text, and optionally all the notes

	// At least the file name is required
	if(args.length < 1) {
		System.err.println("Useage:");
		System.err.println("\tPowerPointExtractor [-notes] <file>");
		System.exit(1);
	}

	// Two or more arguments: notes are switched on and the second
	//  argument is taken as the file. Otherwise the only argument is
	//  the file.
	// NOTE(review): the first argument is never compared against
	//  "-notes", so any leading argument enables notes - confirm
	//  whether that is intended before tightening it
	boolean notes = false;
	String file;
	if(args.length > 1) {
		notes = true;
		file = args[1];
	} else {
		file = args[0];
	}

	PowerPointExtractor ppe = new PowerPointExtractor(file);
	try {
		System.out.println(ppe.getText(true,notes));
	} finally {
		// Always shut the extractor down, even if extraction failed
		ppe.close();
	}