Flatten vector graphics inside pdf and extract usi

2019-03-30 17:02发布

问题:

I am trying to get sizes (width and depth) of images embedded in a PDF file. The images in the PDF are all high resolution vector images.

  • I tried using PDFBox. The PDFBox library extracts images perfectly for normal graphics, but when it encounters vector images, it extracts the different layers as separate images.
  • I have also read about iText, but iText converts the whole page into a single rasterized image, whereas my PDF page actually consists of multiple images and I need to extract/get the size of each of them separately.

I am attaching my PDFBox image extraction code here. Please let me know how I can get each vector image as a single image rather than as separate layers.

My code is as follows:

package com.abp.pdf.util;

import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import javax.imageio.ImageIO;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectForm;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;

/**
 * Command-line tool that writes every image XObject of a PDF to disk and
 * prints each image's height and width.
 *
 * <p>Only XObjects that are {@link PDXObjectImage} instances are written;
 * {@link PDXObjectForm} instances are searched recursively for further image
 * XObjects. Nothing else on a page is visited, so page content that is not
 * stored as an image XObject is not extracted by this class.
 *
 * <p>Supported arguments are listed by {@link #usage()}. When no PDF file is
 * given on the command line, {@link #DEFAULT_PDF_FILE} is processed.
 */
public class ExtractImages {

    /** Input that is processed when the command line names no PDF file. */
    private static final String DEFAULT_PDF_FILE = "/home/suvankar/Resources/myfile.pdf";

    private static final String PASSWORD = "-password";
    private static final String PREFIX = "-prefix";
    private static final String ADDKEY = "-addkey";
    private static final String NONSEQ = "-nonSeq";

    /** Next numeric suffix tried by {@link #getUniqueFileName(String, String)}. */
    private int imageCounter = 1;

    private ExtractImages() {
    }

    /**
     * Program entry point.
     *
     * @param args command-line arguments, see {@link #usage()}
     * @throws Exception if the document cannot be loaded, decrypted or read,
     *                   or an image cannot be written
     */
    public static void main(String[] args) throws Exception {
        ExtractImages extractor = new ExtractImages();
        extractor.extractImages(args);
    }

    /**
     * Parses the arguments, opens the document and extracts the images of
     * every page.
     *
     * @param args command-line arguments, see {@link #usage()}
     * @throws Exception if loading, decrypting or extracting fails
     */
    private void extractImages(String[] args) throws Exception {
        String pdfFile = null;
        String password = "";
        String prefix = null;
        boolean addKey = false;
        // The non-sequential parser is the default here; "-nonSeq" is still
        // accepted so that documented command lines keep working.
        boolean useNonSeqParser = true;

        for (int i = 0; i < args.length; i++) {
            if (PASSWORD.equals(args[i])) {
                i++;
                if (i >= args.length) {
                    usage();
                }
                password = args[i];
            } else if (PREFIX.equals(args[i])) {
                i++;
                if (i >= args.length) {
                    usage();
                }
                prefix = args[i];
            } else if (ADDKEY.equals(args[i])) {
                addKey = true;
            } else if (NONSEQ.equals(args[i])) {
                useNonSeqParser = true;
            } else if (pdfFile == null) {
                pdfFile = args[i];
            }
        }

        if (pdfFile == null) {
            pdfFile = DEFAULT_PDF_FILE;
        }
        if (prefix == null) {
            prefix = defaultPrefix(pdfFile);
        }

        PDDocument document = null;
        try {
            if (useNonSeqParser) {
                document = PDDocument.loadNonSeq(new File(pdfFile), null,
                        password);
            } else {
                document = PDDocument.load(pdfFile);

                if (document.isEncrypted()) {
                    StandardDecryptionMaterial spm = new StandardDecryptionMaterial(
                            password);
                    document.openProtection(spm);
                }
            }
            AccessPermission ap = document.getCurrentAccessPermission();
            if (!ap.canExtractContent()) {
                throw new IOException(
                        "Error: You do not have permission to extract images.");
            }

            // The images are written next to the prefix, so its directory
            // has to exist before the first write.
            ensureParentDirectory(prefix);

            List<?> pages = document.getDocumentCatalog().getAllPages();
            for (Object element : pages) {
                PDPage page = (PDPage) element;
                processResources(page.getResources(), prefix, addKey);
            }
        } finally {
            if (document != null) {
                document.close();
            }
        }
    }

    /**
     * Builds the default output prefix
     * {@code <pdf directory>/extracted/images/<pdf name without extension>}.
     *
     * @param pdfFile path of the input PDF
     * @return the prefix used for the extracted image files
     */
    private static String defaultPrefix(String pdfFile) {
        File input = new File(pdfFile);
        String baseName = input.getName();
        int dot = baseName.lastIndexOf('.');
        if (dot > 0) {
            baseName = baseName.substring(0, dot);
        }
        // getParentFile() is null for a bare file name; File then resolves
        // "extracted" relative to the working directory.
        File outputDir = new File(new File(input.getParentFile(), "extracted"),
                "images");
        return new File(outputDir, baseName).getPath();
    }

    /**
     * Creates the directory that will contain files named after
     * {@code prefix}, if it does not exist yet.
     *
     * @param prefix file name prefix of the images to be written
     * @throws IOException if the directory cannot be created
     */
    private static void ensureParentDirectory(String prefix) throws IOException {
        File dir = new File(prefix).getAbsoluteFile().getParentFile();
        if (dir != null && !dir.isDirectory() && !dir.mkdirs()) {
            throw new IOException(
                    "Error: Could not create output directory " + dir);
        }
    }

    /**
     * Writes every image XObject of {@code resources} to disk and recurses
     * into form XObjects.
     *
     * @param resources resources to scan; {@code null} is ignored
     * @param prefix    file name prefix of the written images
     * @param addKey    whether the XObject key becomes part of the file name
     * @throws IOException if an image cannot be written
     */
    private void processResources(PDResources resources, String prefix,
            boolean addKey) throws IOException {
        if (resources == null) {
            return;
        }
        Map<String, PDXObject> xobjects = resources.getXObjects();
        if (xobjects == null) {
            return;
        }
        for (Map.Entry<String, PDXObject> entry : xobjects.entrySet()) {
            PDXObject xobject = entry.getValue();
            if (xobject instanceof PDXObjectImage) {
                PDXObjectImage image = (PDXObjectImage) xobject;
                String base = addKey ? prefix + "_" + entry.getKey() : prefix;
                String name = getUniqueFileName(base, image.getSuffix());
                System.out.println("Writing image:" + name + "\nHeight - "
                        + image.getHeight() + "\nWidth - " + image.getWidth());
                image.write2file(name);
            } else if (xobject instanceof PDXObjectForm) {
                // A form XObject carries its own resources, which may hold
                // further images.
                PDXObjectForm form = (PDXObjectForm) xobject;
                processResources(form.getResources(), prefix, addKey);
            }
        }
    }

    /**
     * Returns {@code prefix-N} for the first counter value {@code N} for
     * which no file {@code prefix-N.suffix} exists. The counter is shared by
     * all calls on this instance and only ever increases.
     *
     * @param prefix file name prefix
     * @param suffix file extension without the dot
     * @return a file name, without extension, that is not in use yet
     */
    private String getUniqueFileName(String prefix, String suffix) {
        String uniqueName = null;
        File f = null;
        while (f == null || f.exists()) {
            uniqueName = prefix + "-" + imageCounter;
            f = new File(uniqueName + "." + suffix);
            imageCounter++;
        }
        return uniqueName;
    }

    /**
     * This will print the usage requirements and exit.
     */
    private static void usage() {
        System.err
                .println("Usage: java " + ExtractImages.class.getName() + " [OPTIONS] <PDF file>\n"
                        + "  -password  <password>        Password to decrypt document\n"
                        + "  -prefix  <image-prefix>      Image prefix(default to pdf name)\n"
                        + "  -addkey                      add the internal image key to the file name\n"
                        + "  -nonSeq                      Enables the new non-sequential parser\n"
                        + "  <PDF file>                   The PDF document to use\n");
        System.exit(1);
    }

}