I'm playing around a little bit with dataset.readthedocs.io which is an API on top of SQLAlchemy, but I'm not able to generically create the tables in an easy way. I know that it's possible to write create table statements and connect to the table, but I'm wondering if there is a more convenient way.
Example code below:
Pipeline code:
class DBWritePipeline(object):
def __init__(self, dataset_uri, dataset_table):
self.dataset_uri = dataset_uri
self.dataset_table = dataset_table
@classmethod
def from_crawler(cls, crawler):
return cls(
dataset_uri=crawler.settings.get('CONNECTION_STRING'),
dataset_table=crawler.settings.get('DATASET_TABLE', 'itabone')
)
def open_spider(self, spider):
self.db = dataset.connect(self.dataset_uri)
def close_spider(self, spider):
pass
def process_item(self, item, spider):
if hasattr(item,"key"):
self.db[item.table].upsert(item,item.key)
else:
self.db[item.table].insert(item)
#self.db[self.dataset_table].insert(item)
return item
settings:
# Database settings
MYSQL_HOST = 'localhost'
MYSQL_PORT = 3306
MYSQL_USER = 'scrapy'
MYSQL_PASSWORD = 'scrapy'
MYSQL_DB = 'scrapy'
MYSQL_UPSERT = True
CONNECTION_STRING = "{drivername}://{user}:{passwd}@{host}:{port}/{db_name}".format(
drivername="mysql",
user=MYSQL_USER,
passwd=MYSQL_PASSWORD,
host=MYSQL_HOST,
port=MYSQL_PORT,
db_name=MYSQL_DB,)
example items:
class Itabone(scrapy.Item):
name = scrapy.Field(type="String(32)")
val = scrapy.Field()
table = "itabone"
key = "name"
class Itabthree(scrapy.Item):
name = scrapy.Field()
val = scrapy.Field()
table = "itabthree"
class Itabtwo(scrapy.Item):
name = scrapy.Field(max_length=30,key=True)
val = scrapy.Field()
table = "itabtwo"
key = ["name","val"]
Spider:
import scrapy
from myproject.items import Itabone
from myproject.items import Itabtwo
from myproject.items import Itabthree
class ItemTest(scrapy.Spider):
name = 'itempiptest'
start_urls = ['https://www.example.com']
def parse(self, response):
i1 = Itabone(name='tsts',val=1)
i2 = Itabtwo(name='vcvcv',val=2)
i3 = Itabthree(name='vcvcv',val=2)
yield i1
yield i2
yield i3
What should I write in the item.py so the tables are created and updated correctly?