Is there a better approach to using BeautifulSoup in my web crawler?

Posted 2019-06-06 09:50

Question:

I'm trying to crawl information from the URLs on a page and save it to a text file.

I received great help in the question How to get the right source code with Python from the URLs using my web crawler?, and I'm trying to apply what I learned about BeautifulSoup there to finish my code.

But when I look at my code, although it satisfies my needs, it looks pretty messy. Can anyone help me optimize it a little, especially the BeautifulSoup part, such as the infoLists part and the saveinfo part? Thanks!

Here is my code:

import requests
from bs4 import BeautifulSoup
from urlparse import urljoin  # Python 2; on Python 3 use: from urllib.parse import urljoin

url = 'http://bbs.skykiwi.com/forum.php?mod=forumdisplay&fid=55&typeid=470&sortid=231&filter=typeid&pageNum=1&page=1'

#To get the source code from the url
def getsourse(url):
    header = {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 10.0; WOW64; Trident/8.0; Touch)'}
    html = requests.get(url, headers=header)
    return html.content

#To get all the links in current page
def getallLinksinPage(sourceCode):
    soup = BeautifulSoup(sourceCode, 'html.parser')  # explicit parser avoids the "no parser specified" warning
    return [a["href"] for a in soup.select("#threadlist a.xst")]

#To save the info in the info.txt file
def saveinfo(infoLists):
    f = open('info.txt', 'a')
    for each in infoLists:
        f.writelines('Job Title: ' + str(each['title'].encode('utf-8')) + '\n')
        f.writelines('Company Name: ' + str(each['companyName'].encode('utf-8')) + '\n')
        f.writelines('Company Address: ' + str(each['address'].encode('utf-8')) + '\n')
        f.writelines('Job Position: ' + str(each['position'].encode('utf-8')) + '\n')
        f.writelines('Salary: ' + str(each['salary'].encode('utf-8')) + '\n')
        f.writelines('Full/Part time: ' + str(each['jobType'].encode('utf-8')) + '\n')
        f.writelines('Company Tel: ' + str(each['tel'].encode('utf-8')) + '\n')
        f.writelines('Company Email: ' + str(each['email'].encode('utf-8')) + '\n')
        f.writelines('WorkTime: ' + str(each['workTime'].encode('utf-8')) + '\n\n')
    f.close()

sourceCode = getsourse(url) # source code of the url page
allLinksinPage = getallLinksinPage(sourceCode) #a List of the urls in current page
linkNum = 1
infoLists = []
for eachLink in allLinksinPage:
    print('Now downloading link '+str(linkNum))
    url = 'http://bbs.skykiwi.com/'
    realUrl = urljoin(url, eachLink)
    html = getsourse(realUrl)
    soup = BeautifulSoup(html, 'html.parser')
    infoList = {}  # To save the following info, such as title, companyName, etc.
    infoList['title'] = soup.find(attrs={'id': 'thread_subject'}).string
    infoList2=[] #To temporarily save info except 'title'
    #FROM HERE IT GETS MESSY...
    for line in soup.find_all(attrs={'class':'typeoption'}): # first locate the bigClass
        for td in line.find_all('td'):  # then locate all the 'td's
            infoList2.append(td.string)
        try:
            for eachInfo in infoList2:
                infoList['companyName'] = infoList2[0]
                infoList['address'] = infoList2[1]
                infoList['position'] = infoList2[2]
                infoList['salary'] = infoList2[3]
                infoList['jobType'] = infoList2[4]
                infoList['tel'] = infoList2[5]
                infoList['email'] = infoList2[6]
                infoList['workTime'] = infoList2[7]
        finally:
            linkNum += 1 # To print link number
    infoLists.append(infoList)

saveinfo(infoLists)

Answer 1:

Using zip() and a list comprehension would dramatically improve readability:

headers = ['companyName', 'address', 'position', 'salary', 'jobType', 'tel', 'email', 'workTime']

infoLists = [dict(zip(headers, [item.string for item in line.find_all('td')[:8]])) 
             for line in soup.select(".typeoption")]
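
To see how that slots into the rest of the script, here is a minimal sketch of the full crawl with the zip() idea folded in. The helper names (get_soup, scrape_thread, FIELDS) are invented for illustration, and it keeps the original code's assumptions: each thread page has a single .typeoption table whose first eight td cells appear in the order companyName through workTime. It is written for Python 2 to match the code above:

import requests
from bs4 import BeautifulSoup
from urlparse import urljoin

BASE_URL = 'http://bbs.skykiwi.com/'
LIST_URL = BASE_URL + 'forum.php?mod=forumdisplay&fid=55&typeid=470&sortid=231&filter=typeid&pageNum=1&page=1'
HEADER = {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 10.0; WOW64; Trident/8.0; Touch)'}

# (label in info.txt, dict key) pairs, in output order
FIELDS = [('Job Title', 'title'), ('Company Name', 'companyName'),
          ('Company Address', 'address'), ('Job Position', 'position'),
          ('Salary', 'salary'), ('Full/Part time', 'jobType'),
          ('Company Tel', 'tel'), ('Company Email', 'email'),
          ('WorkTime', 'workTime')]
KEYS = [key for _, key in FIELDS if key != 'title']

def get_soup(url):
    # Fetch a page and parse it with an explicit parser
    html = requests.get(url, headers=HEADER).content
    return BeautifulSoup(html, 'html.parser')

def scrape_thread(url):
    # One dict per thread: the title plus the first eight td cells
    soup = get_soup(url)
    info = {'title': soup.find(attrs={'id': 'thread_subject'}).string}
    cells = soup.select('.typeoption td')[:8]
    info.update(zip(KEYS, [td.string for td in cells]))
    return info

listing = get_soup(LIST_URL)
links = [a['href'] for a in listing.select('#threadlist a.xst')]

with open('info.txt', 'a') as f:
    for linkNum, link in enumerate(links, 1):
        print('Now downloading link ' + str(linkNum))
        info = scrape_thread(urljoin(BASE_URL, link))
        for label, key in FIELDS:
            f.write(label + ': ' + (info.get(key) or u'').encode('utf-8') + '\n')
        f.write('\n')

One nice side effect of zip() here: if a page has fewer than eight td cells, the missing fields are simply absent from the dict and written as blanks, instead of raising an IndexError the way indexing into infoList2 would.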