I've written some code in python to parse title and link from a webpage. Initially, I tried to parse the links from the left sided bar then scrape those aforesaid documents from each page by tracking down each links. I did this flawlessly. I tried to save the documents of different links in different pages in a single excel file. However, It creates several "Sheets" extracting the desired portion as the sheet name from heading variable from my script. The problem I'm facing is- when the data are saved, only the last record of each page from the links are saved in my excel sheets instead of the full records. Here is the script I tried with:
import requests
from lxml import html
from pyexcel_ods3 import save_data
web_link = "http://www.wiseowl.co.uk/videos/"
main_url = "http://www.wiseowl.co.uk"
def get_links(page):
response = requests.Session().get(page)
tree = html.fromstring(response.text)
data = {}
titles = tree.xpath("//ul[@class='woMenuList']//li[@class='woMenuItem']/a/@href")
for title in titles:
if "author" not in title and "year" not in title:
get_docs(data, main_url + title)
def get_docs(data, url):
response = requests.Session().get(url)
tree = html.fromstring(response.text)
heading = tree.findtext('.//h1[@class="gamma"]')
for item in tree.xpath("//p[@class='woVideoListDefaultSeriesTitle']"):
title = item.findtext('.//a')
link = item.xpath('.//a/@href')[0]
# print(title, link)
data.update({heading.split(" ")[-4]: [[(title)]]})
save_data("mth.ods", data)
if __name__ == '__main__':
get_links(web_link)