I have searched in many places but haven't come across a script that properly extracts URLs from emails, so I am presenting what I came up with. It has been working well for me.
This can handle plain-text and html-text content-types, supports quoted-printable, base64, and 7bit encodings.
NOTE: I wrote this as part of another task, you may have to tweak it to suit your need. Post any questions, and I can help answer.
Modules to import for this to work:
import traceback
import BeautifulSoup
import re
from sets import Set
import email
import quopri, base64
Here are the APIs I wrote that will do this job:
def decode_quote_printable_part(self, quo_pri_part):
"""
Decodes a quote-printable encoded MIME object
:param quo_pri_part: MIME msg part
:return: decoded text, null if exception
"""
try:
quo_pri_payload = quo_pri_part.get_payload()
return quopri.decodestring(quo_pri_payload)
except Exception as err:
print "ERROR - Exception when decoding quoted printable: %s" % err
return ""
def decode_base64_part(self, base64_part):
    """
    Decode a base64 encoded MIME payload.

    Note: unlike decode_quote_printable_part, this takes the payload
    string itself, not the MIME part object.

    :param base64_part: base64-encoded payload string
    :return: the decoded payload, or "" if decoding fails
    """
    try:
        return base64.b64decode(base64_part)
    except Exception as err:
        # Best-effort: log and skip malformed parts rather than raising.
        # print() form is Python 2/3 compatible (the original used the
        # Python-2-only print statement).
        print("ERROR - Exception when decoding base64 part: %s" % err)
        return ""
def get_urls_from_html_part(self, html_code):
    """
    Parse decoded HTML text and extract href links from its anchor tags.

    Only hrefs containing "http" are kept, so mailto: and relative links
    are dropped (the original docstring wrongly claimed mailto: links were
    included).

    :param html_code: decoded HTML text
    :return: list of href URLs; empty list if parsing fails
    """
    try:
        soup = BeautifulSoup.BeautifulSoup(html_code)
        html_urls = []
        for link in soup.findAll("a"):
            url = link.get("href")
            # Keep only absolute http(s)-style links; skips mailto:, tel:,
            # fragments and relative paths.
            if url and "http" in url:
                html_urls.append(url)
        return html_urls
    except Exception as err:
        # Best-effort: malformed HTML should not abort the whole email.
        print("ERROR - Exception when parsing the html body: %s" % err)
        return []
def get_urls_from_plain_part(self, email_data):
    """
    Parse plain text and extract the http:// and https:// URLs from it.

    Each URL starts at an occurrence of "http://" or "https://" and runs
    until the first character outside the allowed URL-character set. Each
    scan is also bounded by the start of the next URL occurrence, so two
    adjacent URLs are never merged.

    :param email_data: decoded plain text to parse
    :return: deduplicated list of URLs (order unspecified); empty list if
             nothing is found or an exception occurs
    """
    try:
        # Characters considered part of a URL; comparison is done on the
        # lowercased character, so uppercase letters are accepted too.
        allowed = "abcdefghijklmnopqrstuvwxyz0123456789./\\~#%&()_-+=;?:[]!$*,@'^`<{|\""

        def _leading_url_chars(segment):
            # Return the prefix of `segment` made of allowed URL characters.
            url = ""
            for ch in segment:
                if ch.lower() not in allowed:
                    break
                url += ch
            return url

        # Start offsets of every http:// and https:// occurrence.
        indices = [m.start() for m in re.finditer('http://', email_data)]
        indices.extend([n.start() for n in re.finditer('https://', email_data)])
        if not indices:
            return []

        # Bound each scan by the next occurrence; the final one runs to EOF.
        segments = [email_data[x:y] for x, y in zip(indices, indices[1:])]
        segments.append(email_data[indices[-1]:])
        urls = [_leading_url_chars(segment) for segment in segments]

        # Builtin set() replaces sets.Set, which was removed in Python 3.
        return list(set(urls))
    except Exception as err:
        # Best-effort: log and return an empty list rather than raising.
        print("ERROR - Exception when parsing plain text for urls: %s" % err)
        return []
def get_urls_list(self, msg):
    """
    Collect all URLs from every part of an email message.

    Walks the MIME tree, decodes each leaf part according to its
    Content-Transfer-Encoding (quoted-printable, base64, or as-is for
    7bit/8bit), extracts URLs from text/plain and text/html parts, and
    buckets the results by scheme.

    :param msg: email.message.Message object
    :return: dict of the form {'http': [...], 'https': [...]}
    """
    urls = []
    for part in msg.walk():
        # Header lookup via indexing (not __getitem__ dunder call); header
        # values are case-insensitive per RFC 2045, so normalize before
        # comparing — "Base64" and "base64" are both valid.
        encoding = (part["Content-Transfer-Encoding"] or "").lower()
        if encoding == "quoted-printable":
            decoded_part = self.decode_quote_printable_part(part)
        elif encoding == "base64":
            decoded_part = self.decode_base64_part(part.get_payload())
        else:
            # 7bit/8bit or missing header: payload is already usable text.
            decoded_part = part.get_payload()
        subtype = part.get_content_subtype()
        if subtype == "plain":
            urls.extend(self.get_urls_from_plain_part(decoded_part))
        elif subtype == "html":
            urls.extend(self.get_urls_from_html_part(decoded_part))
    # Bucket by scheme: anything without "http://" is assumed https,
    # matching the extraction logic which only finds http/https URLs.
    final_urls = {'http': [], 'https': []}
    for url in urls:
        if "http://" in url:
            final_urls['http'].append(url)
        else:
            final_urls['https'].append(url)
    return final_urls
Here is how to call this API:
try:
    # Read the raw email from disk and parse it into a Message object.
    with open(filename, 'r') as f:
        data = f.read()
    msg = email.message_from_string(data)
    final_urls = self.get_urls_list(msg)
except Exception as err:
    # Report failures instead of silently swallowing them (the original
    # bare `except: pass` hid even missing-file errors).
    print("ERROR - Failed to extract URLs from %s: %s" % (filename, err))