def start_requests(self):
    """Yield the first request for each page template of ArchiveSpider.

    This overrides the Scrapy hook documented at
    http://doc.scrapy.org/en/latest/topics/spiders.html#scrapy.spiders.Spider.start_requests

    Every request carries an 'archive_meta' dict in its meta, built from
    the spider's paging keywords (``self.p_kw``, which is set up according
    to the site's 'archive_rules'). The callback is ``self.parse``; the
    meta is meant to be used there to parse article URLs from the response
    and to generate the next request.
    """
    for template in self.page_templates:
        # Resolve the first page URL from the template's ``p_num`` field.
        first_url = template.format(p_num=self.p_kw['start'])
        archive_meta = {
            'last_urls': {},
            'p_num': self.p_kw['start'],
            'next_tries': 0,
            'max_next_tries': self.p_kw['max_next_tries'],
            # Keep the raw template so later pages can be formatted from it.
            'page': template,
        }
        request_meta = {'archive_meta': archive_meta}
        logger.debug(
            'Page format meta info:\n%s', pprint.pformat(request_meta))
        yield scrapy.Request(
            first_url, callback=self.parse, meta=request_meta)
# Example source code for Python spiders() (heading carried over from the snippet listing)
def __init__(self, domains, urls, *args, **kwargs):
    """Constructor for FeedSpider.

    Parameters
    ----------
    domains : list
        A list of domains for the site.
    urls : list
        A list of feed URLs of the site.
    provider : string
        The provider of RSS feed. Defaults to 'self'.
    url_regex : string
        URL pattern regular expression.
    iterator : string
        Stored as ``self.iterator``. Defaults to 'iternodes'.
    itertag : string
        Stored as ``self.itertag``. Defaults to 'item'.

    If you use this spider to store item into database, additional
    keywords are required:

    platform_id : int
        The id of a platform instance.
    session : object
        An instance of SQLAlchemy session.

    Other keywords are used to specify how to parse the XML, see
    http://doc.scrapy.org/en/latest/topics/spiders.html#scrapy.spiders.XMLFeedSpider.
    """
    # Pop the spider-specific keywords so that only the remaining ones are
    # forwarded to the parent constructor.
    self.platform_id = kwargs.pop('platform_id', None)
    self.session = kwargs.pop('session', None)
    self.url_regex = kwargs.pop('url_regex', None)
    self.provider = kwargs.pop('provider', 'self')
    self.iterator = kwargs.pop('iterator', 'iternodes')
    # Read the 'itertag' keyword here. The previous code popped 'iterator'
    # a second time, which had already been consumed above, so the caller's
    # itertag was never picked up by this assignment.
    self.itertag = kwargs.pop('itertag', 'item')
    self.allowed_domains = domains
    self.start_urls = urls
    super(FeedSpider, self).__init__(*args, **kwargs)
def parse(self, response):
    """Extract the movie fields of one listing page into a single item.

    Each populated field holds the full list returned by ``extract()`` for
    its XPath, so the lists are expected to be index-aligned (one entry
    per movie on the page).

    Parameters
    ----------
    response : object
        The response to parse; only its ``xpath`` method is used.

    Yields
    ------
    DoubanTopMoviesItem
        One item with 'title_ch', 'rating_num', 'rating_count',
        'image_urls' and 'topid' filled in.
    """
    item = DoubanTopMoviesItem()
    item['title_ch'] = response.xpath(
        '//div[@class="hd"]//span[@class="title"][1]/text()').extract()

    # NOTE(review): the original comment here was lost to an encoding
    # error. Presumably the fields below were disabled because not every
    # movie has all of them, which would break the index alignment of the
    # lists -- confirm before re-enabling.
    # en_list = response.xpath('//div[@class="hd"]//span[@class="title"][2]/text()').extract()
    # item['title_en'] = [en.replace('\xa0/\xa0','').replace(' ','') for en in en_list]
    # ht_list = response.xpath('//div[@class="hd"]//span[@class="other"]/text()').extract()
    # item['title_ht'] = [ht.replace('\xa0/\xa0','').replace(' ','') for ht in ht_list]
    # detail_list = response.xpath('//div[@class="bd"]/p[1]/text()').extract()
    # item['detail'] = [detail.replace(' ', '').replace('\xa0', '').replace('\n', '') for detail in detail_list]
    # NOTE(review): original comment unreadable; presumably some movies
    # have no quote, so this field is disabled as well.
    # item['quote'] = response.xpath('//span[@class="inq"]/text()').extract()

    item['rating_num'] = response.xpath(
        '//div[@class="star"]/span[2]/text()').extract()

    # The count text embeds the number in surrounding words; keep only the
    # first run of digits.
    count_texts = response.xpath(
        '//div[@class="star"]/span[4]/text()').extract()
    digits = re.compile(r'\d+')
    rating_counts = []
    for text in count_texts:
        found = digits.search(text)
        # An empty placeholder keeps this list aligned with the other
        # fields instead of failing the whole page on one odd entry.
        rating_counts.append(found.group() if found else '')
    item['rating_count'] = rating_counts

    item['image_urls'] = response.xpath(
        '//div[@class="pic"]/a/img/@src').extract()
    item['topid'] = response.xpath(
        '//div[@class="pic"]/em/text()').extract()
    yield item

    # Disabled: follow the "next" link to crawl the following page.
    # new_url = response.xpath('//link[@rel="next"]/@href').extract_first()
    # if new_url:
    #     next_url = self.base_url+new_url
    #     yield scrapy.Request(next_url, callback=self.parse)
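###### ------- Alternative (presumably): crawl via start_urls plus a LinkExtractor rule ------- ######
# NOTE: the original non-ASCII comments in this section were lost to an
# encoding error; the wording below is reconstructed from the surrounding code.
# from scrapy.spiders import CrawlSpider, Rule
# from scrapy.linkextractors import LinkExtractor
# class SpDoubanSpider(CrawlSpider):
#     ...
#     # Follow every link that matches the top-250 pagination URL pattern.
#     rules = [Rule(LinkExtractor(allow=(r'https://movie.douban.com/top250\?start=\d+.*')),
#                   callback='parse_item', follow=True)
#              ]
#     def parse_item(self, response):
#         # Populate the item here (presumably the same extraction as in parse).
#         yield item
###### ------- End of the start_urls / LinkExtractor alternative ------- ######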