I'm writing a python program to crawl twitter using a combination of urllib2, the python twitter wrapper for the api, and BeautifulSoup. However, when I run my program, I get an error of the following type:
ray_krueger RafaelNadal
Traceback (most recent call last):
File "C:\Users\Public\Documents\Columbia Job\Python Crawler\Twitter Crawler\crawlerversion9.py", line 78, in <module>
crawl(start_follower, output, depth)
File "C:\Users\Public\Documents\Columbia Job\Python Crawler\Twitter Crawler\crawlerversion9.py", line 74, in crawl
crawl(y, output, in_depth - 1)
File "C:\Users\Public\Documents\Columbia Job\Python Crawler\Twitter Crawler\crawlerversion9.py", line 74, in crawl
crawl(y, output, in_depth - 1)
File "C:\Users\Public\Documents\Columbia Job\Python Crawler\Twitter Crawler\crawlerversion9.py", line 64, in crawl
request = urllib2.Request(new_url)
File "C:\Python28\lib\urllib2.py", line 192, in __init__
self.__original = unwrap(url)
File "C:\Python28\lib\urllib.py", line 1038, in unwrap
url = url.strip()
AttributeError: 'NoneType' object has no attribute 'strip'
I'm completely unfamiliar with this type of error (new to python) and searching for it online has yielded very little information. I've attached my code as well, but do you have any suggestions?
Thanks, Snehizzy
import twitter
import urllib
import urllib2
import htmllib
from BeautifulSoup import BeautifulSoup
import re
start_follower = "NYTimeskrugman"  # seed Twitter username the crawl begins from
depth = 3  # maximum recursion depth when following friends-of-friends
output = open(r'C:\Python27\outputtest.txt', 'a') # better to use an SQL database than this
api = twitter.Api()  # unauthenticated python-twitter client
#want to also begin entire crawl with some sort of authentication service
def site(follower):
    """Return the mobile-Twitter profile URL for *follower*."""
    return "http://mobile.twitter.com/" + follower
def getPage(follower):
    """Fetch *follower*'s mobile-Twitter page; return the open HTTP response."""
    url = site(follower)
    return urllib2.urlopen(urllib2.Request(url))
def getSoup(response):
    """Read an HTTP response body and parse it into a BeautifulSoup tree."""
    return BeautifulSoup(response.read())
def get_more_tweets(soup):
    """Return the absolute URL of the 'more' pagination link, or None.

    Returns None when no 'more' link is present (e.g. on the last page
    of tweets) -- callers MUST check for None before fetching the URL;
    passing None to urllib2.Request raises AttributeError.
    """
    # Bug fix: the original wrote findAll('a', {'href': True}, {id: 'more_link'})
    # -- the third dict used the *builtin* id function as a key and was bound
    # to findAll's positional `recursive` parameter, so the id filter never
    # applied. Merge both filters into the single attrs dict instead.
    links = soup.findAll('a', {'href': True, 'id': 'more_link'})
    for link in links:
        if str(link.renderContents()) == 'more':
            return 'http://mobile.twitter.com' + link['href']
    return None
def recordlinks(soup, output):
    """Write the rendered contents of each tweet <div> to *output*.

    Each tweet is followed by a blank-line separator.
    """
    tweet_divs = soup.findAll('div', {'class' : "list-tweet"})#to obtain tweet of a follower
    for tweet_div in tweet_divs:
        rendered = str(tweet_div.renderContents())
        output.write(rendered)
        output.write('\n\n')
def checkforstamp(soup):
    """Return True if any status link's text reads '3 months ago', else None.

    Used by crawl() as the stop condition when paging back through a
    user's tweets.
    """
    # Bug fix: the original referenced the undefined name `nsoup`
    # (NameError at runtime); the parameter is `soup`. Also, the class
    # filter was passed as findAll's positional `recursive` argument and
    # never applied -- merged into the attrs dict.
    times = soup.findAll('a', {'href': True, 'class': 'status_link'})
    for time in times:
        if str(time.renderContents()) == '3 months ago':
            return True
    return None
def crawl(follower, output, in_depth):
if in_depth > 0:
output.write(follower)
a = getPage(follower)
new_soup = getSoup(a)
recordlinks(new_soup, output)
currenttime = False
while currenttime == False:
new_url = get_more_tweets(new_soup)
request = urllib2.Request(new_url)
response = urllib2.urlopen(request)
new_soup = getSoup(response)
recordlinks(new_soup, output)
currenttime = checkforstamp(new_soup)
users = api.GetFriends(follower)
for u in users[0:5]:
x = u.screen_name
y = str(x)
print y
crawl(y, output, in_depth - 1)
output.write('\n\n')
output.write('\n\n\n')
# Entry point: start the crawl from the seed user at the configured depth.
crawl(start_follower, output, depth)
print("Program done. Look at output file.")