So I'm trying to get a specific bit of text out of some PDFs, and I'm using Python with PDFMiner but having some trouble due to the API changes to it that happened in November 2013. Basically, to get the part of text I want out of the PDF, I currently have to convert the entire file to text, and then use string functions to get the part I want. What I want to do is loop through each page of the PDF and convert each one to text, one by one. Then once I've found the part I want, I'll just stop it from reading that PDF.
I'll post the code that's sitting in my text editor atm, but it's not the working version, it's more the half-way-to-the-efficient-solution version :P
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.converter import LTChar, TextConverter
from pdfminer.layout import LAParams
from subprocess import call
from cStringIO import StringIO
import re
import sys
import os
argNum = len(sys.argv)
pdfLoc = str(sys.argv[1]) #CLI arguments
def convert_pdf_to_txt(path): #converts pdf to raw text (not my function)
rsrcmgr = PDFResourceManager()
retstr = StringIO()
codec = 'utf-8'
laparams = LAParams()
device = TextConverter(rsrcmgr, retstr, codec=codec, laparams=laparams)
fp = file(path, 'rb')
interpreter = PDFPageInterpreter(rsrcmgr, device)
password = ""
maxpages = 0
caching = True
pagenos=set()
for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
interpreter.process_page(page)
fp.close()
device.close()
str = retstr.getvalue()
retstr.close()
return str
if (pdfLoc[-4:] == ".pdf"):
contents = ""
try: # Get the outlines (contents) of the document
fp = open(pdfLoc, 'rb') #open a pdf document for reading
parser = PDFParser(fp)
document = PDFDocument(parser)
outlines = document.get_outlines()
for (level,title,dest,a,se) in outlines:
title = re.sub(r".*\s", "", title) #get raw titles, stripped of formatting
contents += title + "\n"
except: #if pdfMiner can't get contents then manually get contents from text conversion
#contents = convert_pdf_to_txt(pdfLoc)
#startToCpos = contents.find("TABLE OF CONTENTS")
#endToCpos = contents.rfind(". . .")
#contents = contents[startToCpos:endToCpos+8]
fp = open(pdfLoc, 'rb') #open a pdf document for reading
parser = PDFParser(fp)
document = PDFDocument(parser)
pages = PDFPage(document, 3, {'Resources':'thing', 'MediaBox':'Thing'}) #God knows what's going on here
for pageNumber, page in enumerate(pages.get_pages(PDFDocument, fp)): #The hell is the first argument?
if pageNumber == 42:
print "Hello"
#for line in s:
# print line
# if (re.search("(\.\s){2,}", line) and not re.search("NOTES|SCOPE", line)):
# line = re.sub("(\.\s){2,}", "", line)
# line = re.sub("(\s?)*[0-9]*\n", "\n", line)
# line = re.sub("^\s", "", line)
# print line,
#contents = contents.lower()
#contents = re.sub("“", "\"", contents)
#contents = re.sub("”", "\"", contents)
#contents = re.sub("fi", "f", contents)
#contents = re.sub(r"(TABLE OF CONTENTS|LIST OF TABLES|SCOPE|REFERENCED DOCUMENTS|Identification|System (o|O)verview|Document (o|O)verview|Title|Page|Table|Tab)(\n)?|\.\s?|Section|[0-9]", "", contents)
#contents = re.sub(r"This document contains proprietary information and may not be reproduced in any form whatsoever, nor may be used by or its contents divulged to third\nparties without written permission from the ownerAll rights reservedNumber: STP SMEDate: -Jul-Issue: A of CMC STPNHIndustriesCLASSIFICATION\nNATO UNCLASSIFIED AGUSTAEUROCOPTEREUROCOPTER DEUTSCHLAND FOKKER", "", contents)
#contents = re.sub(r"(\r?\n){2,}", "", contents)
#contents = contents.lstrip()
#contents = contents.rstrip()
#print contents
else:
print "Not a valid PDF file"
This is the old way of doing it (Or at least an idea of how the old way did it, the thread wasn't very useful to me tbh). But now I have to use PDFPage.get_pages instead of PDFDocument.get_pages and the methods and their arguments are completely different.
Currently, I'm trying to figure out what on earth the 'Klass' variable is that I pass to the get_pages method of PDFPage.
If anybody could shed some light on this part of the API or even provide a working example I'd very much appreciate it.