def __init__(self, rule):
    dispatcher.connect(self.spider_opened, signals.spider_opened)
    dispatcher.connect(self.spider_closed, signals.spider_closed)
    self.rule = rule
    self.name = rule.name
    self.allowed_domains = rule.allowed_domains.split(',')
    self.start_urls = rule.start_urls.split(',')
    rule_list = []
    # If a "next page" XPath is configured, add a rule to follow pagination links.
    if rule.next_page:
        rule_list.append(Rule(LinkExtractor(restrict_xpaths=rule.next_page), follow=True))
    # Extract the content links and hand them to parse_item.
    rule_list.append(Rule(
        LinkExtractor(allow=rule.allow_url.split(','), unique=True),
        follow=True,
        callback='parse_item'))
    self.rules = tuple(rule_list)
    super(ProxySpiderSpider, self).__init__()
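For context, this constructor assumes imports and class scaffolding along the lines of the sketch below. The CrawlRuleConfig object standing in for `rule` is hypothetical, reconstructed only from the attributes the code reads (name, allowed_domains, start_urls, next_page, allow_url); the signal handlers and parse_item are placeholders.

from scrapy import signals
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from pydispatch import dispatcher  # older projects: from scrapy.xlib.pydispatch import dispatcher


class CrawlRuleConfig(object):
    """Hypothetical stand-in for the stored rule object passed to the spider."""
    def __init__(self, name, allowed_domains, start_urls, allow_url, next_page=''):
        self.name = name                          # spider name
        self.allowed_domains = allowed_domains    # comma-separated domains
        self.start_urls = start_urls              # comma-separated start URLs
        self.allow_url = allow_url                # comma-separated allow patterns
        self.next_page = next_page                # XPath of the "next page" link


class ProxySpiderSpider(CrawlSpider):
    # __init__ as shown above; self.rules must be built before CrawlSpider.__init__().

    def spider_opened(self, spider):
        spider.logger.info('spider opened: %s', spider.name)

    def spider_closed(self, spider):
        spider.logger.info('spider closed: %s', spider.name)

    def parse_item(self, response):
        yield {'url': response.url}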
Example source code for the Python Rule() class
Source file: newspaper_crawler.py
Project: newspaper-scraper-couchbase
Author: aleonsan
def __init__(self, topic=None, newspaper=None, term='', *args, **kwargs):
    self.term = term
    if newspaper:
        sources = [source for source in SOURCE_NEWSPAPERS if newspaper == source['name']]
    else:
        sources = TOPIC_TO_SOURCES.get(topic, SOURCE_NEWSPAPERS)
    self.allowed_domains = [source['allowed_domains'] for source in sources]
    self.start_urls = [source['url'] for source in sources]
    self.rules = []
    for source in sources:
        if topic:
            allowed_domain_regex = (source['allowed_subdomains_regex'][topic],)
        else:
            # No topic given: allow every subdomain pattern the source defines.
            allowed_domain_regex = tuple(source['allowed_subdomains_regex'].values())
        rule = Rule(link_extractor=LinkExtractor(allow=allowed_domain_regex),
                    callback='parse_with_term',
                    cb_kwargs={
                        'term': self.term,
                        'newspaper': newspaper,
                    },
                    follow=True)
        self.rules.append(rule)
    super(NewspaperCrawler, self).__init__(*args, **kwargs)
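The constructor above only works against configuration structures shaped like the following. The keys mirror the lookups in the code (source['name'], source['url'], source['allowed_domains'], source['allowed_subdomains_regex'][topic]), but every concrete value here is a hypothetical example.

# Hypothetical configuration; field names follow the lookups in __init__ above.
SOURCE_NEWSPAPERS = [
    {
        'name': 'example-times',
        'url': 'http://www.example-times.test/',
        'allowed_domains': 'example-times.test',
        'allowed_subdomains_regex': {
            'politics': r'example-times\.test/politics/',
            'economy': r'example-times\.test/economy/',
        },
    },
]

# Maps a topic to the sources that cover it; the spider falls back to all
# sources when the topic is unknown.
TOPIC_TO_SOURCES = {
    'politics': SOURCE_NEWSPAPERS,
    'economy': SOURCE_NEWSPAPERS,
}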
def __init__(self, *args, **kwargs):
    self.rules = [Rule(self.get_link_extractor(),
                       callback=self.parse_item,
                       process_links=self.limit_links,
                       follow=True)]
    super(WebSpider, self).__init__(*args, **kwargs)
    target_sites = settings.get('TARGET_SITES')
    if target_sites and os.path.isfile(target_sites):
        # Read the target URLs from the file, one per line, dropping empty lines.
        with open(target_sites) as target_sites_file:
            self.start_urls = target_sites_file.read().splitlines()
        self.start_urls = [u for u in self.start_urls if u]
    else:
        self.start_urls = self.default_start_url
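The two hooks referenced above, get_link_extractor() and limit_links(), are defined elsewhere in the project. A plausible minimal version is sketched below; the method bodies and the default_start_url value are assumptions, only the call sites come from the constructor.

from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider


class WebSpider(CrawlSpider):
    # Hypothetical fallback used when no TARGET_SITES file is configured.
    default_start_url = ['http://example.test/']

    def get_link_extractor(self):
        # Keep the crawl inside the spider's allowed domains.
        return LinkExtractor(allow_domains=getattr(self, 'allowed_domains', ()))

    def limit_links(self, links):
        # process_links hook: cap the number of links taken from each page.
        return links[:100]

    def parse_item(self, response):
        yield {'url': response.url}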
def __init__(self, forum_id=58, digit=1, *args, **kwargs):
    self.start_urls = [self.ip_format % d for d in [int(forum_id)]]
    # Follow forum index pages whose page number has up to `digit` digits.
    self.rules = [Rule(
        sle(allow=(r"/forum/forum-" + str(forum_id) + r"-[0-9]{," + str(digit) + r"}\.html")),
        follow=True, callback='parse_1')]
    super(sisSpider, self).__init__(*args, **kwargs)
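This snippet leans on two module-level names that are not shown: `sle` is a commonly used alias for LinkExtractor in older Scrapy examples, and `ip_format` is a class attribute holding the forum index URL template. A hedged sketch of that surrounding class, with a placeholder URL:

from scrapy.linkextractors import LinkExtractor as sle  # assumed alias
from scrapy.spiders import CrawlSpider


class sisSpider(CrawlSpider):
    name = 'sis'
    # Hypothetical index-page template that __init__ fills in with forum_id.
    ip_format = 'http://example-forum.test/forum/forum-%d-1.html'

    def parse_1(self, response):
        # Callback named in the rule above.
        yield {'url': response.url}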
def __init__(self, *args, **kwargs):
    self.rules = (
        spiders.Rule(SameBaseDomainLinkExtractor(allowed_domains=self.allowed_domains),
                     callback=self._parse_contents,
                     follow=True),
    )
    # Quiet down the noisiest Scrapy loggers.
    logging.getLogger('scrapy.core.engine').setLevel(logging.INFO)
    logging.getLogger('scrapy.downloadermiddlewares.redirect').setLevel(logging.INFO)
    logging.getLogger('scrapy.spidermiddlewares.depth').setLevel(logging.INFO)
    # We must set up self.rules before calling super, since super calls _compile_rules().
    super(AllStudiosScraper, self).__init__(*args, **kwargs)
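SameBaseDomainLinkExtractor is project-specific and not shown here. A rough, assumed equivalent can be built by delegating to the stock LinkExtractor with allow_domains, which is all this sketch does; the real class may do more.

from scrapy.linkextractors import LinkExtractor


class SameBaseDomainLinkExtractor(LinkExtractor):
    """Assumed behaviour: only extract links that stay on the allowed domains."""

    def __init__(self, allowed_domains=(), **kwargs):
        super(SameBaseDomainLinkExtractor, self).__init__(
            allow_domains=allowed_domains, **kwargs)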
def __init__(self, domains, urls, *args, **kwargs):
    """Constructor for SiteSpider.

    Parameters
    ----------
    domains : list
        A list of domains for the site.
    urls : list
        A list of sitemap URLs of the site.
    href_xpaths : list
        A list of XPath expressions indicating the ancestors of the
        `<a>` elements to extract.
    url_regex : string
        URL pattern regular expression.

    If you use this spider to store items into a database, additional
    keyword arguments are required:

    platform_id : int
        The id of a platform instance.
    session : object
        An instance of a SQLAlchemy session.
    """
    self.session = kwargs.pop('session', None)
    self.platform_id = kwargs.pop('platform_id', None)
    self.url_regex = kwargs.pop('url_regex', None)
    self.href_xpaths = kwargs.pop('href_xpaths', ())
    self.start_urls = urls
    self.allowed_domains = domains
    self.rules = (Rule(
        LinkExtractor(
            allow_domains=self.allowed_domains,
            restrict_xpaths=self.href_xpaths,
            unique=True),
        callback="parse_item",
        follow=True),)
    super(SiteSpider, self).__init__(*args, **kwargs)
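A minimal way to launch this spider from a script is sketched below. All argument values are placeholders; platform_id and session are only needed when items are stored to a database, so they are omitted here.

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
process.crawl(
    SiteSpider,
    domains=['example.test'],
    urls=['http://example.test/sitemap.xml'],
    href_xpaths=['//div[@class="content"]'],
    url_regex=r'/article/\d+',
)
process.start()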