I am trying to get the sizes (width and depth) of the images embedded in a PDF file. The images in the PDF are all high-resolution vector images.
- I tried using PDFBox. The PDFBox libraries extract images perfectly for normal (raster) graphics, but when they encounter a vector image, they extract its different layers as separate images.
- I have also read about iText, but iText can only convert the whole page into a single rasterized image, whereas my PDF page actually consists of multiple images and I need to extract (or get the size of) each of them separately.
I am attaching my PDFBox image extraction code here. Please let me know how I can get one vector image as a single image and not as separate layers.
My code is as follows:
package com.abp.pdf.util;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import javax.imageio.ImageIO;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDResources;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectForm;
import org.apache.pdfbox.pdmodel.graphics.xobject.PDXObjectImage;
/**
 * Command-line tool that walks every page of a PDF document, prints the
 * pixel width and height of each embedded image XObject and writes the image
 * to disk. Form XObjects are searched recursively for nested images.
 *
 * <p>When started without arguments the tool processes
 * {@link #DEFAULT_PDF_FILE} and writes the images to an
 * {@code extracted/images/} directory next to that file.
 */
public class ExtractImages {

    /** PDF processed when no file is named on the command line. */
    private static final String DEFAULT_PDF_FILE = "/home/suvankar/Resources/myfile.pdf";

    /** Directory (relative to the PDF's own directory) used for the default output prefix. */
    private static final String DEFAULT_OUTPUT_DIR = "extracted/images";

    private static final String PASSWORD_OPTION = "-password";
    private static final String PREFIX_OPTION = "-prefix";
    private static final String ADD_KEY_OPTION = "-addkey";
    private static final String NON_SEQ_OPTION = "-nonSeq";

    /** Running number appended to file names so that every written image gets its own file. */
    private int imageCounter = 1;

    private ExtractImages() {
    }

    /**
     * Program entry point.
     *
     * @param args command-line options, see {@link #usage()}
     * @throws Exception if the document cannot be opened, decrypted or read,
     *                   or an image cannot be written
     */
    public static void main(String[] args) throws Exception {
        ExtractImages extractor = new ExtractImages();
        extractor.extractImages(args);
    }

    /**
     * Parses the command line, opens the document and extracts the images of
     * every page.
     *
     * @param args command-line options, see {@link #usage()}
     * @throws Exception if the document cannot be opened, decrypted or read,
     *                   or an image cannot be written
     */
    private void extractImages(String[] args) throws Exception {
        String pdfFile = null;
        String password = "";
        String prefix = null;
        boolean addKey = false;
        // The non-sequential parser is the default of this tool; the flag is
        // still accepted on the command line for compatibility.
        boolean useNonSeqParser = true;

        for (int i = 0; i < args.length; i++) {
            String arg = args[i];
            if (PASSWORD_OPTION.equals(arg)) {
                if (i + 1 >= args.length) {
                    usage(); // exits the JVM
                }
                password = args[++i];
            } else if (PREFIX_OPTION.equals(arg)) {
                if (i + 1 >= args.length) {
                    usage(); // exits the JVM
                }
                prefix = args[++i];
            } else if (ADD_KEY_OPTION.equals(arg)) {
                addKey = true;
            } else if (NON_SEQ_OPTION.equals(arg)) {
                useNonSeqParser = true;
            } else if (pdfFile == null) {
                pdfFile = arg;
            } else {
                usage(); // more than one PDF file given
            }
        }

        if (pdfFile == null) {
            pdfFile = DEFAULT_PDF_FILE;
        }
        if (prefix == null) {
            prefix = defaultPrefix(pdfFile);
        }

        PDDocument document = null;
        try {
            if (useNonSeqParser) {
                document = PDDocument.loadNonSeq(new File(pdfFile), null, password);
            } else {
                document = PDDocument.load(pdfFile);
                if (document.isEncrypted()) {
                    StandardDecryptionMaterial spm = new StandardDecryptionMaterial(password);
                    document.openProtection(spm);
                }
            }

            AccessPermission ap = document.getCurrentAccessPermission();
            if (!ap.canExtractContent()) {
                throw new IOException(
                        "Error: You do not have permission to extract images.");
            }

            // The images are written as <prefix>-<n>.<suffix>; make sure the
            // directory part of the prefix exists before the first write.
            ensureParentDirectory(prefix);

            List<?> pages = document.getDocumentCatalog().getAllPages();
            for (Object entry : pages) {
                PDPage page = (PDPage) entry;
                // findResources() also returns resources inherited from the
                // page tree; getResources() yields null for such pages and
                // their images would be skipped.
                PDResources resources = page.findResources();
                processResources(resources, prefix, addKey);
            }
        } finally {
            if (document != null) {
                document.close();
            }
        }
    }

    /**
     * Builds the default output prefix for a PDF file:
     * {@code <directory of the PDF>/extracted/images/<PDF name without ".pdf">}.
     *
     * @param pdfFile path of the PDF document
     * @return the prefix used for the names of the extracted images
     */
    private static String defaultPrefix(String pdfFile) {
        File pdf = new File(pdfFile);
        String baseName = pdf.getName();
        // Strip a trailing ".pdf" (any case) but never produce an empty name.
        if (baseName.length() > 4
                && baseName.regionMatches(true, baseName.length() - 4, ".pdf", 0, 4)) {
            baseName = baseName.substring(0, baseName.length() - 4);
        }
        // getParentFile() is null for a bare file name; File then resolves the
        // output directory relative to the working directory.
        File outputDir = new File(pdf.getParentFile(), DEFAULT_OUTPUT_DIR);
        return new File(outputDir, baseName).getPath();
    }

    /**
     * Creates the directory that will contain the files named after
     * {@code prefix}, if it does not exist yet.
     *
     * @param prefix output file name prefix, possibly including directories
     * @throws IOException if the directory is missing and cannot be created
     */
    private static void ensureParentDirectory(String prefix) throws IOException {
        File directory = new File(prefix).getAbsoluteFile().getParentFile();
        if (directory == null || directory.isDirectory()) {
            return;
        }
        // Re-check after mkdirs() in case the directory appeared concurrently.
        if (!directory.mkdirs() && !directory.isDirectory()) {
            throw new IOException("Error: Could not create output directory " + directory);
        }
    }

    /**
     * Reports and writes every image XObject found in {@code resources} and
     * recurses into form XObjects, which may carry further images.
     *
     * @param resources resource dictionary to scan; {@code null} is ignored
     * @param prefix    output file name prefix
     * @param addKey    whether the XObject's resource key becomes part of the file name
     * @throws IOException if an image cannot be read or written
     */
    private void processResources(PDResources resources, String prefix,
            boolean addKey) throws IOException {
        if (resources == null) {
            return;
        }
        Map<String, PDXObject> xobjects = resources.getXObjects();
        if (xobjects == null) {
            return;
        }
        for (Map.Entry<String, PDXObject> entry : xobjects.entrySet()) {
            PDXObject xobject = entry.getValue();
            if (xobject instanceof PDXObjectImage) {
                // write the image
                PDXObjectImage image = (PDXObjectImage) xobject;
                String base = addKey ? prefix + "_" + entry.getKey() : prefix;
                String name = getUniqueFileName(base, image.getSuffix());
                System.out.println("Writing image:" + name + "\nHeight - "+ image.getHeight() + "\nWidth - " + image.getWidth());
                image.write2file(name);
            } else if (xobject instanceof PDXObjectForm) {
                // maybe there are more images embedded in a form object
                PDXObjectForm xObjectForm = (PDXObjectForm) xobject;
                processResources(xObjectForm.getResources(), prefix, addKey);
            }
        }
    }

    /**
     * Returns the first name of the form {@code <prefix>-<n>} for which no
     * file {@code <prefix>-<n>.<suffix>} exists yet, advancing the image
     * counter past every number it tries.
     *
     * @param prefix file name prefix, possibly including directories
     * @param suffix file extension without the leading dot
     * @return the unused file name, without the extension
     */
    private String getUniqueFileName(String prefix, String suffix) {
        String uniqueName;
        File candidate;
        do {
            uniqueName = prefix + "-" + imageCounter;
            candidate = new File(uniqueName + "." + suffix);
            imageCounter++;
        } while (candidate.exists());
        return uniqueName;
    }

    /**
     * This will print the usage requirements and exit.
     */
    private static void usage() {
        System.err
                .println("Usage: java com.abp.pdf.util.ExtractImages [OPTIONS] <PDF file>\n"
                        + " -password <password> Password to decrypt document\n"
                        + " -prefix <image-prefix> Image prefix(default to pdf name)\n"
                        + " -addkey add the internal image key to the file name\n"
                        + " -nonSeq Enables the new non-sequential parser (default)\n"
                        + " <PDF file> The PDF document to use\n");
        System.exit(1);
    }
}