def __init__(self, domains, urls, *args, **kwargs):
    """Set up the spider's start URLs, allowed domains and crawl rule.

    Spider-specific keyword options are removed from ``kwargs`` before the
    remaining arguments are forwarded to the parent initializer.

    Parameters
    ----------
    domains : list
        A list of domains for the site. Stored as ``allowed_domains`` and
        passed to the link extractor as ``allow_domains``.
    urls : list
        A list of sitemap URLs of the site. Stored as ``start_urls``.
    href_xpaths : list, optional
        A list of XPath expressions indicating the ancestors of ``<a>``
        elements; passed to the link extractor as ``restrict_xpaths``.
        Defaults to an empty tuple.
    url_regex : string, optional
        URL pattern regular expression. Only stored on the instance here.
        Defaults to None.

    If you use this spider to store items into a database, additional
    keywords are required:

    platform_id : int
        The id of a platform instance. Defaults to None.
    session : object
        An instance of SQLAlchemy session. Defaults to None.
    """
    # Take our own options out of kwargs so that only the remaining
    # keywords reach the parent initializer.
    self.href_xpaths = kwargs.pop('href_xpaths', ())
    self.url_regex = kwargs.pop('url_regex', None)
    self.platform_id = kwargs.pop('platform_id', None)
    self.session = kwargs.pop('session', None)

    self.allowed_domains = domains
    self.start_urls = urls

    # A single rule: extract unique links within the allowed domains
    # (restricted to the configured XPath regions), hand each response to
    # "parse_item" and keep following links.
    link_extractor = LinkExtractor(
        allow_domains=self.allowed_domains,
        restrict_xpaths=self.href_xpaths,
        unique=True)
    # NOTE(review): rules are assigned before the parent initializer runs,
    # presumably because the base class (looks like Scrapy's CrawlSpider)
    # processes them during __init__ -- confirm against the class header.
    self.rules = (
        Rule(link_extractor, callback="parse_item", follow=True),
    )

    super(SiteSpider, self).__init__(*args, **kwargs)
评论列表
文章目录