I'm creating a script to download some mp3 podcasts from a site and write them to a certain location. I'm nearly finished, and the files are being downloaded and created. However, I'm running into a problem where the binary data can't be fully decoded and the mp3 files won't play.
Here's my code:
import re
import os
import urllib2
from bs4 import BeautifulSoup
import time
def getHTMLstring(url):
html = urllib2.urlopen(url)
soup = BeautifulSoup(html)
soupString = soup.encode('utf-8')
return soupString
def getList(html_string):
urlList = re.findall('(http://podcast\.travelsinamathematicalworld\.co\.uk\/mp3/.*\.mp3)', html_string)
firstUrl = urlList[0]
finalList = [firstUrl]
for url in urlList:
if url != finalList[0]:
finalList.insert(0,url)
return finalList
def getBinary(netLocation):
req = urllib2.urlopen(netLocation)
reqSoup = BeautifulSoup(req)
reqString = reqSoup.encode('utf-8')
return reqString
def getFilename(string):
splitTerms = string.split('/')
fileName = splitTerms[-1]
return fileName
def writeFile(sourceBinary, fileName):
with open(fileName, 'wb') as fp:
fp.write(sourceBinary)
def main():
htmlString = getHTMLstring('http://www.travelsinamathematicalworld.co.uk')
urlList = getList(htmlString)
fileFolder = 'D:\\Dropbox\\Mathematics\\Travels in a Mathematical World\\Podcasts'
os.chdir(fileFolder)
for url in urlList:
name = getFilename(url)
binary = getBinary(url)
writeFile(binary, name)
time.sleep(2)
if __name__ == '__main__':
main()
When I run the code, I get the following warning in my console:
WARNING:root:Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.
I'm thinking that it has to do with the fact that the data that I'm using is encoded in UTF-8, and maybe the write method expects a different encoding? I'm new to Python (and really to programming in general), and I'm stuck.