import java.io.FileInputStream; import java.io.InputStream; import org.apache.tika.parser.pdf.PDFParser; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.BodyContentHandler; import org.xml.sax.ContentHandler; public class ParsePDFWithTika { public static void main(String args[]) throws Exception { InputStream is = null; try { is = new FileInputStream("C:/Temp/realhowto-vbs-20121221.pdf"); ContentHandler contenthandler = new BodyContentHandler(); Metadata metadata = new Metadata(); PDFParser pdfparser = new PDFParser(); pdfparser.parse(is, contenthandler, metadata, new ParseContext()); System.out.println(contenthandler.toString()); } catch (Exception e) { e.printStackTrace(); } finally { if (is != null) is.close(); } } }
Thursday, 29 May 2014
How to extract text from PDF Using Apache Tika Library In Java
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment