What I'm trying to do is extract the image associated with some text in a PDF file. For instance, a PDF would have a photo of the front of a house. Just above the photo, there would be a caption which reads "Front View". I want the program to search the PDF for the text "Front View" and extract the photo that follows it.
I've looked at iTextSharp, PDFsharp, and other utilities, but all of them treat the text in a PDF and the images separately. There doesn't seem to be any way to figure out that this line of text comes before that image.
We use iTextSharp for manipulating PDFs. I've written a method in C# that will extract an image given a page number, the number of the image on the page, and the image type. For instance, I can extract the 2nd jpeg on page 3. Here is the code for that. What I would like is to be able to search for a line of text in the file and then extract the image that follows that line of text.
/// <summary>
/// Render listener that writes the images encountered on a single PDF page to disk.
/// It can either save every image that passes the optional file-type filter, or only
/// the N-th such image on the page.
/// </summary>
public class ImageExtractor : IRenderListener
{
    // Page handed to the content parser; also embedded in every output file name.
    private int _currentPage = 1;

    // Number of images accepted for output so far. Incremented even when an existing
    // file is left untouched, and used as the numeric suffix of the output file name.
    private int _imageCount = 0;

    // 1-based position (among images that pass the type filter) of the single image to
    // extract. Any value of 0 or less means "extract every matching image".
    private readonly int _index = 0;

    // Number of matching images seen so far; only maintained when _index > 0.
    private int _count = 0;

    private readonly string _outputFilePrefix;
    private readonly string _outputFolder;
    private readonly bool _overwriteExistingFiles;

    // Accepted image file types (compared case-insensitively); null accepts every type.
    private readonly string[] _fileTypes;

    /// <summary>
    /// Creates a listener that saves images as
    /// <c>{outputFilePrefix}_{page}_{n}.{type}</c> inside <paramref name="outputFolder"/>.
    /// </summary>
    /// <param name="outputFilePrefix">First component of every output file name.</param>
    /// <param name="outputFolder">Folder the image files are written to.</param>
    /// <param name="overwriteExistingFiles">
    /// When false, an image whose target file already exists is not rewritten (it is still counted).
    /// </param>
    /// <param name="fileTypes">File types to accept, or null to accept all.</param>
    /// <param name="index">
    /// 1-based position of the single matching image to extract; 0 or less extracts all matching images.
    /// </param>
    public ImageExtractor(string outputFilePrefix, string outputFolder, bool overwriteExistingFiles, string[] fileTypes, int index)
    {
        _outputFilePrefix = outputFilePrefix;
        _outputFolder = outputFolder;
        _overwriteExistingFiles = overwriteExistingFiles;
        _fileTypes = fileTypes;
        _index = index;
    }

    /// <summary>
    /// Extracts images from one page of a PDF file.
    /// </summary>
    /// <param name="pdfPath">Path of the PDF file to read.</param>
    /// <param name="outputFilePrefix">
    /// Prefix of the output file names; when null, the PDF's file name without extension is used.
    /// </param>
    /// <param name="outputFolder">
    /// Destination folder; when null or empty, the folder containing the PDF is used.
    /// </param>
    /// <param name="overwriteExistingFiles">Whether existing image files may be overwritten.</param>
    /// <param name="pageNumber">1-based number of the page to process.</param>
    /// <param name="index">
    /// 1-based position of the image to extract among those matching <paramref name="fileTypes"/>;
    /// 0 or less extracts every matching image on the page.
    /// </param>
    /// <param name="fileTypes">File types to accept (case-insensitive), or null for all types.</param>
    /// <returns>
    /// The number of images selected for output (including images skipped because the target
    /// file already existed), or 0 when the document has no pages.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="pdfPath"/> is null.</exception>
    /// <exception cref="ApplicationException">The PDF is encrypted.</exception>
    /// <exception cref="ArgumentOutOfRangeException">
    /// <paramref name="pageNumber"/> is not a page of the document.
    /// </exception>
    public static int ExtractImageByIndex(string pdfPath, string outputFilePrefix, string outputFolder, bool overwriteExistingFiles, int pageNumber, int index, string[] fileTypes = null)
    {
        if (pdfPath == null)
        {
            throw new ArgumentNullException("pdfPath");
        }

        // Handle setting of any default values
        outputFilePrefix = outputFilePrefix ?? System.IO.Path.GetFileNameWithoutExtension(pdfPath);
        outputFolder = String.IsNullOrEmpty(outputFolder) ? System.IO.Path.GetDirectoryName(pdfPath) : outputFolder;

        var instance = new ImageExtractor(outputFilePrefix, outputFolder, overwriteExistingFiles, fileTypes, index);
        instance._currentPage = pageNumber;

        using (var pdfReader = new PdfReader(pdfPath))
        {
            if (pdfReader.NumberOfPages == 0)
            {
                return 0;
            }

            if (pdfReader.IsEncrypted())
            {
                throw new ApplicationException(pdfPath + " is encrypted.");
            }

            // Reject an out-of-range page up front instead of handing the parser a page
            // that does not exist in the document.
            if (pageNumber < 1 || pageNumber > pdfReader.NumberOfPages)
            {
                throw new ArgumentOutOfRangeException(
                    "pageNumber",
                    pageNumber,
                    "Page number must be between 1 and " + pdfReader.NumberOfPages + ".");
            }

            var pdfParser = new PdfReaderContentParser(pdfReader);
            pdfParser.ProcessContent(instance._currentPage, instance);
        }

        return instance._imageCount;
    }

    /// <summary>Text events are ignored; this listener only handles images.</summary>
    public void BeginTextBlock() { }

    /// <summary>Text events are ignored; this listener only handles images.</summary>
    public void EndTextBlock() { }

    /// <summary>Text events are ignored; this listener only handles images.</summary>
    public void RenderText(TextRenderInfo renderInfo) { }

    /// <summary>
    /// Called for each image on the page. Saves the image when it passes the file-type
    /// filter and, in single-image mode, when it is the requested occurrence.
    /// </summary>
    /// <param name="renderInfo">Render information for the image being processed.</param>
    public void RenderImage(ImageRenderInfo renderInfo)
    {
        // If _index is greater than 0, we're looking for a specific image. If _count is
        // equal to _index, we've already found it, so don't go any farther.
        if (_index > 0 && _count == _index)
        {
            return;
        }

        var imageObject = renderInfo.GetImage();
        var fileType = imageObject.GetFileType();

        // Ordinal, case-insensitive match: lower-casing with the current culture can make
        // equal type names compare unequal (e.g. "TIF" under a Turkish culture), and it
        // throws on a null entry in the filter array.
        if (_fileTypes != null
            && !Array.Exists(_fileTypes, t => String.Equals(t, fileType, StringComparison.OrdinalIgnoreCase)))
        {
            return;
        }

        // If _index is 0, multiple images may be extracted. If _index is greater than 0,
        // count every image that matches the file type and only extract the one whose
        // position equals _index.
        if (_index > 0)
        {
            _count++;
            if (_count != _index)
            {
                return;
            }
        }

        // NOTE(review): in single-image mode _imageCount is still 0 here, so the file name
        // suffix is always 0 regardless of _index. Extracting different indexes from the
        // same page therefore targets the same file name. Left as-is because callers have
        // to reconstruct this name to locate the output.
        var imageFileName = String.Format("{0}_{1}_{2}.{3}", _outputFilePrefix, _currentPage, _imageCount, fileType);
        var imagePath = System.IO.Path.Combine(_outputFolder, imageFileName);

        if (_overwriteExistingFiles || !File.Exists(imagePath))
        {
            File.WriteAllBytes(imagePath, imageObject.GetImageAsBytes());
        }

        // Subtle: Always increment even if file is not written. This ensures consistency should only some
        // of a PDF file's images actually exist.
        _imageCount++;
    }
}