Apache PDFBox is a library that allows you to create PDF documents, manipulate existing documents, and even extract content from existing documents.
Apache PDFBox provides the following features:
Please refer to the following before using it:
Java Program to parse PDF document to Text using Apache PDFBox :
Apache PDFBox provides the following features:
- Text Extraction
- Merging and Splitting
- Forms Filling
- PDF Printing
- PDF/A Validations
- PDF To Image Conversion
- PDF Creation
- Integration with Lucene Search Engine
Please refer to the following before using it:
- API Documentation
- Apache PDFBox jars
- Apache PDFBox for Licensing
Java Program to parse PDF document to Text using Apache PDFBox :
package com.anuj.apachepdfbox;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.exceptions.COSVisitorException;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.edit.PDPageContentStream;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.util.PDFTextStripper;

/**
 * Demonstrates two Apache PDFBox use cases: extracting the text of a PDF found
 * on the classpath and writing a one-line PDF file.
 *
 * <p>Failures are logged at {@code SEVERE} level and not propagated to the caller.
 *
 * @author Anuj
 */
public class PDFTextParser {

    /** Shared logger; same logger name the class has always logged under. */
    private static final Logger LOGGER = Logger.getLogger(PDFTextParser.class.getName());

    public PDFTextParser() {
    }

    /**
     * Converts PDF to Text using Apache PDFBox and prints the extracted text to
     * standard output.
     *
     * <p>The PDF is looked up as a classpath resource through this class's class
     * loader, not as a file-system path. If the resource cannot be found, the
     * problem is logged and the method returns without printing anything.
     *
     * @param fileName classpath resource name of the PDF to read
     */
    public void convertPDFToText(String fileName) {
        InputStream inputStream = null;
        COSDocument cosDocument = null;
        PDDocument pdDocument = null;

        try {
            inputStream = getClass().getClassLoader().getResourceAsStream(fileName);
            if (inputStream == null) {
                // getResourceAsStream signals "not found" with null rather than an exception.
                LOGGER.log(Level.SEVERE, "PDF resource not found on classpath: {0}", fileName);
                return;
            }

            // Parse the raw PDF into its low-level COS representation.
            PDFParser pdfParser = new PDFParser(inputStream);
            pdfParser.parse();
            cosDocument = pdfParser.getDocument();
            pdDocument = new PDDocument(cosDocument);

            // Create PDFTextStripper to convert PDF to Text.
            PDFTextStripper textStripper = new PDFTextStripper();
            String pdfText = textStripper.getText(pdDocument);

            System.out.println("PDF Content :");
            System.out.println(pdfText);
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, null, ex);
        } finally {
            // Each resource is closed in its own try block so that a failure while
            // closing one of them does not prevent the others from being released.
            try {
                if (cosDocument != null) {
                    cosDocument.close();
                }
            } catch (IOException ex) {
                LOGGER.log(Level.SEVERE, null, ex);
            }
            try {
                if (pdDocument != null) {
                    pdDocument.close();
                }
            } catch (IOException ex) {
                LOGGER.log(Level.SEVERE, null, ex);
            }
            try {
                if (inputStream != null) {
                    inputStream.close();
                }
            } catch (IOException ex) {
                LOGGER.log(Level.SEVERE, null, ex);
            }
        }
    }

    /**
     * Convert Text to PDF using Apache PDFBox: creates a single A4 page holding
     * one fixed line of text and saves it.
     *
     * @param fileName path of the PDF file to write
     */
    public void convertTextToPDF(String fileName) {
        System.out.println("Save Text to PDF File: " + fileName);

        PDDocument pDDocument = null;
        try {
            // Create the document with one A4 page.
            pDDocument = new PDDocument();
            PDPage pDPage = new PDPage();
            pDPage.setMediaBox(PDPage.PAGE_SIZE_A4);
            pDDocument.addPage(pDPage);

            // The content stream is closed in a finally block so it is released
            // even when one of the drawing calls throws.
            PDPageContentStream contentStream = new PDPageContentStream(pDDocument, pDPage);
            try {
                PDType1Font font = PDType1Font.COURIER;
                float fontSize = 8;
                contentStream.setFont(font, fontSize);

                // Write data to the content stream.
                contentStream.beginText();
                contentStream.moveTextPositionByAmount(30, 800);
                contentStream.drawString("Hello Anuj, I am created using Apache PDFBox");
                contentStream.endText();
            } finally {
                contentStream.close();
            }

            // Save only after the content stream has been closed.
            pDDocument.save(fileName);
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, null, ex);
        } catch (COSVisitorException ex) {
            LOGGER.log(Level.SEVERE, null, ex);
        } finally {
            try {
                if (pDDocument != null) {
                    pDDocument.close();
                }
            } catch (IOException ex) {
                LOGGER.log(Level.SEVERE, null, ex);
            }
        }
    }

    /**
     * Runs both demos: prints the text of the classpath resource
     * "NB Shortcuts.pdf", then writes "TextToPDF.pdf".
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        PDFTextParser pDFTextParser = new PDFTextParser();
        pDFTextParser.convertPDFToText("NB Shortcuts.pdf");

        // complex one. Would prefer jasper reports or iText than this for writing
        pDFTextParser.convertTextToPDF("TextToPDF.pdf");
    }
}
No comments:
Post a Comment