Sunday, June 13, 2010

Apache PDFBox - Parse PDF to Text Using Java

Apache PDFBox is a library that allows you to create PDF documents, manipulate existing documents, and extract content from existing documents.

Apache PDFBox provides the following features:
  • Text Extraction
  • Merging and Splitting
  • Forms Filling
  • PDF Printing
  • PDF/A Validations
  • PDF To Image Conversion
  • PDF Creation
  • Integration with Lucene Search Engine
In the example below, the PDFTextParser class takes a PDF as input and parses the provided PDF document into text.

Please note the following before using it:
If you are using Maven, your pom.xml must include the Apache PDFBox Maven dependency.



Pom.xml

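<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.anuj</groupId>
    <artifactId>ApachePDFBox</artifactId>
    <version>1.0-SNAPSHOT</version>
    <packaging>jar</packaging>

    <name>ApachePDFBox</name>
    <url>http://maven.apache.org</url>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    </properties>

    <dependencies>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>3.8.1</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox</artifactId>
            <version>1.8.2</version>
        </dependency>
    </dependencies>
</project>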


Java program to parse a PDF document to text (and to write a simple PDF) using Apache PDFBox:
package com.anuj.apachepdfbox;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.pdfbox.cos.COSDocument;
import org.apache.pdfbox.exceptions.COSVisitorException;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.edit.PDPageContentStream;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
import org.apache.pdfbox.util.PDFTextStripper;

/**
 * Small demonstration of the Apache PDFBox 1.8.x API: extracts the text of a
 * PDF found on the classpath and writes a one-line PDF file.
 *
 * @author Anuj
 */
public class PDFTextParser {

    /** Shared logger so every failure is reported under the same logger name. */
    private static final Logger LOGGER = Logger.getLogger(PDFTextParser.class.getName());

    public PDFTextParser() {
    }

    /**
     * Converts PDF to Text using Apache PDFBox and prints the extracted text
     * to standard output.
     *
     * <p>The PDF is looked up as a classpath resource through this class's
     * class loader. If the resource cannot be found, or an I/O error occurs
     * while parsing, the problem is logged at SEVERE level and the method
     * returns without printing anything.
     *
     * @param fileName classpath-relative name of the PDF resource to read
     */
    public void convertPDFToText(String fileName) {
        InputStream inputStream = getClass().getClassLoader().getResourceAsStream(fileName);
        if (inputStream == null) {
            // getResourceAsStream signals "not found" with null; report it
            // explicitly instead of handing a null stream to the parser.
            LOGGER.log(Level.SEVERE, "PDF resource not found on classpath: {0}", fileName);
            return;
        }

        COSDocument cosDocument = null;
        PDDocument pdDocument = null;
        try {
            PDFParser pdfParser = new PDFParser(inputStream);

            //parse document
            pdfParser.parse();

            cosDocument = pdfParser.getDocument();
            pdDocument = new PDDocument(cosDocument);

            //create PDFTextStripper to convert PDF to Text
            PDFTextStripper textStripper = new PDFTextStripper();
            String pdfText = textStripper.getText(pdDocument);

            System.out.println("PDF Content :");
            System.out.println(pdfText);

        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, null, ex);
        } finally {
            //Close documents
            try {
                if (pdDocument != null) {
                    // Closing the PDDocument also closes the COSDocument it wraps,
                    // so the COSDocument is closed directly only when the wrapper
                    // was never created.
                    pdDocument.close();
                } else if (cosDocument != null) {
                    cosDocument.close();
                }
            } catch (IOException ex) {
                LOGGER.log(Level.SEVERE, null, ex);
            }
            // The resource stream is owned by this method and must be released
            // whether or not parsing succeeded.
            try {
                inputStream.close();
            } catch (IOException ex) {
                LOGGER.log(Level.SEVERE, null, ex);
            }
        }
    }

    /**
     * Convert Text to PDF using Apache PDFBox.
     *
     * <p>Creates a single A4 page containing one fixed line of text in 8pt
     * Courier and saves the document to {@code fileName}. I/O and PDFBox
     * serialization failures are logged at SEVERE level rather than thrown.
     *
     * @param fileName path of the PDF file to create
     */
    public void convertTextToPDF(String fileName) {
        System.out.println("Save Text to PDF File: "+fileName);
        PDDocument pDDocument = null;

        try {
            //Create PDDocument
            pDDocument = new PDDocument();
            PDPage pDPage = new PDPage();
            pDPage.setMediaBox(PDPage.PAGE_SIZE_A4);

            //Add Page to Document
            pDDocument.addPage(pDPage);

            //Create PDPageContentStream
            PDPageContentStream contentStream = new PDPageContentStream(pDDocument, pDPage);
            try {
                //SetFont
                PDType1Font font = PDType1Font.COURIER;
                float fontSize = 8;
                contentStream.setFont(font, fontSize);

                //Write data to Contentstream
                contentStream.beginText();
                contentStream.moveTextPositionByAmount(30,800);
                contentStream.drawString("Hello Anuj, I am created using Apache PDFBox");
                contentStream.endText();
            } finally {
                // Always release the content stream, even when a drawing call fails.
                contentStream.close();
            }

            //Save to Document
            pDDocument.save(fileName);

        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, null, ex);
        } catch (COSVisitorException ex) {
            LOGGER.log(Level.SEVERE, null, ex);
        } finally {
            try {
                if (pDDocument != null) {
                    pDDocument.close();
                }
            } catch (IOException ex) {
                LOGGER.log(Level.SEVERE, null, ex);
            }
        }
    }

    /**
     * Runs both demos: prints the text of the classpath resource
     * "NB Shortcuts.pdf", then writes "TextToPDF.pdf".
     *
     * @param args unused
     */
    public static void main(String[] args) {
        PDFTextParser pDFTextParser = new PDFTextParser();
        pDFTextParser.convertPDFToText("NB Shortcuts.pdf");

        //complex one. Would prefer jasper reports or iText than this for writing
        pDFTextParser.convertTextToPDF("TextToPDF.pdf");
    }
}
Author : Anuj Patel
Blog : http://goldenpackagebyanuj.blogspot.in/