I'm trying to crawl information from the URLs on a page and save it in a text file.
I received great help in the question "How to get the right source code with Python from the URLs using my web crawler?", and I have tried to use what I learned about BeautifulSoup to finish my code based on that question.
But when I look at my code, although it satisfies my needs, it looks pretty messy. Can anyone help me optimize it a little, especially the BeautifulSoup part, such as the infoLists part and the saveinfo part? Thanks!
Here is my code:
import io
from urlparse import urljoin

import requests
from bs4 import BeautifulSoup
# Forum listing page to crawl: it is downloaded first, and the thread links
# found on it are the pages whose details get saved.
url = 'http://bbs.skykiwi.com/forum.php?mod=forumdisplay&fid=55&typeid=470&sortid=231&filter=typeid&pageNum=1&page=1'
# Download the raw body of the page at `url`.
def getsourse(url, timeout=30):
    """Fetch ``url`` with a browser-like User-Agent and return the body.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits indefinitely, so one stalled
            connection would hang the whole crawl.

    Returns:
        The response body exactly as received (``Response.content``), not
        decoded to text. The HTTP status code is not checked, so the body
        of an error page is returned like any other.

    Raises:
        requests.exceptions.RequestException: on connection failure or
            when ``timeout`` expires.
    """
    header = {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 10.0; WOW64; Trident/8.0; Touch)'}
    response = requests.get(url, headers=header, timeout=timeout)
    return response.content
# Collect the thread links listed on the current page.
def getallLinksinPage(sourceCode, parser='html.parser'):
    """Return the ``href`` of every ``a.xst`` anchor inside ``#threadlist``.

    Args:
        sourceCode: HTML of the listing page (text or bytes).
        parser: Name of the parser BeautifulSoup should use. Naming one
            explicitly keeps the result independent of which optional
            parsers happen to be installed and avoids the "no parser was
            explicitly specified" warning.

    Returns:
        A list of ``href`` values in document order. They are returned as
        written in the page, so they may be relative URLs. Anchors that
        carry no ``href`` attribute are skipped instead of raising
        ``KeyError``.
    """
    soup = BeautifulSoup(sourceCode, parser)
    return [a['href']
            for a in soup.select('#threadlist a.xst')
            if a.has_attr('href')]
# Label written to the output file, paired with the record key that holds
# the value. The order here is the order of the lines in the file.
_INFO_FIELDS = (
    (u'Job Title', 'title'),
    (u'Company Name', 'companyName'),
    (u'Company Address', 'address'),
    (u'Job Position', 'position'),
    (u'Salary', 'salary'),
    (u'Full/Part time', 'jobType'),
    (u'Company Tel', 'tel'),
    (u'Company Email', 'email'),
    (u'WorkTime', 'workTime'),
)


# Append the collected job records to a text file.
def saveinfo(infoLists, path='info.txt'):
    """Append each record in ``infoLists`` to ``path`` as labelled lines.

    Every record produces one ``Label: value`` line per entry of
    ``_INFO_FIELDS`` followed by a blank line. The file is opened in
    append mode and encoded as UTF-8.

    Args:
        infoLists: Iterable of dicts keyed by the names in
            ``_INFO_FIELDS`` with text values. A key that is missing or
            whose value is ``None`` is written as an empty value rather
            than aborting the whole save.
        path: Destination file; defaults to ``info.txt`` in the current
            working directory.
    """
    # io.open does the UTF-8 encoding once at the file boundary, and the
    # ``with`` block closes the file even when a write fails.
    with io.open(path, 'a', encoding='utf-8') as f:
        for each in infoLists:
            for label, key in _INFO_FIELDS:
                value = each.get(key)
                if value is None:
                    value = u''
                f.write(u'%s: %s\n' % (label, value))
            f.write(u'\n')  # blank line separates consecutive records
# Record keys filled, in this order, from the <td> cells found inside the
# elements with class 'typeoption' on a thread page.
_DETAIL_KEYS = ('companyName', 'address', 'position', 'salary',
                'jobType', 'tel', 'email', 'workTime')


def _buildInfo(title, cells):
    """Build one job record from a thread title and its detail cells.

    ``cells`` are matched positionally against ``_DETAIL_KEYS``. Surplus
    cells are ignored and missing ones stay as empty strings, so every
    record carries all the keys that ``saveinfo`` reads even when a page
    has fewer cells than expected.
    """
    info = dict.fromkeys(_DETAIL_KEYS, u'')
    info.update(zip(_DETAIL_KEYS, cells))
    info['title'] = title
    return info


sourceCode = getsourse(url)  # source code of the listing page
allLinksinPage = getallLinksinPage(sourceCode)  # a list of the urls in current page
baseUrl = 'http://bbs.skykiwi.com/'  # links from the page are resolved against this
infoLists = []
for linkNum, eachLink in enumerate(allLinksinPage, 1):
    print('Now downloading link ' + str(linkNum))
    realUrl = urljoin(baseUrl, eachLink)
    soup = BeautifulSoup(getsourse(realUrl), 'html.parser')

    # find() returns None when the element is absent; fall back to an empty
    # title instead of failing on the attribute access.
    titleTag = soup.find(attrs={'id': 'thread_subject'})
    title = titleTag.get_text() if titleTag is not None else u''

    # get_text() also copes with cells that contain nested markup, where
    # .string would give None.
    cells = []
    for section in soup.find_all(attrs={'class': 'typeoption'}):
        cells.extend(td.get_text() for td in section.find_all('td'))

    infoLists.append(_buildInfo(title, cells))
saveinfo(infoLists)