python类spiders()的实例源码-面圈网

url.py 文件源码项目：hoaxy-backend 作者: IUNetSci 项目源码文件源码阅读 33 收藏 0 点赞 0 评论 0

def start_requests(self):
        """Yield the first request of ArchiveSpider for each page template.

        See 'http://doc.scrapy.org/en/latest/topics/spiders.html#scrapy.spiders.Spider.start_requests'.

        Every request carries an 'archive_meta' entry in its meta. It holds
        the paging state taken from `self.p_kw` (starting page number and
        the maximum number of "next page" tries) together with the page
        template itself, and is handed to `self.parse` with the response.
        """
        for template in self.page_templates:
            first_page = self.p_kw['start']
            target = template.format(p_num=first_page)
            archive_meta = {
                'last_urls': dict(),
                'p_num': first_page,
                'next_tries': 0,
                'max_next_tries': self.p_kw['max_next_tries'],
                'page': template,
            }
            request_meta = {'archive_meta': archive_meta}
            logger.debug('Page format meta info:\n%s',
                         pprint.pformat(request_meta))
            yield scrapy.Request(
                target, callback=self.parse, meta=request_meta)

url.py 文件源码项目：hoaxy-backend 作者: IUNetSci 项目源码文件源码阅读 35 收藏 0 点赞 0 评论 0

def __init__(self, domains, urls, *args, **kwargs):
        """Constructor for FeedSpider.

        Parameters
        ----------
        domains : list
            A list of domains for the site; stored as `allowed_domains`.
        urls : list
            A list of feed URLs of the site; stored as `start_urls`.

        Optional keywords consumed by this constructor:

        provider : string
            The provider of RSS feed. Defaults to 'self'.
        url_regex : string
            URL pattern regular expression. Defaults to None.
        iterator : string
            Name of the XML iterator to use. Defaults to 'iternodes'.
        itertag : string
            Name of the node (tag) to iterate over. Defaults to 'item'.

        If you use this spider to store item into database, additional
        keywords are required:

        platform_id : int
            The id of a platform instance.
        session : object
            An instance of SQLAlchemy session.

        Any remaining positional and keyword arguments are forwarded to the
        parent constructor. For how the XML is parsed, see
        http://doc.scrapy.org/en/latest/topics/spiders.html#scrapy.spiders.XMLFeedSpider.
        """
        self.platform_id = kwargs.pop('platform_id', None)
        self.session = kwargs.pop('session', None)
        self.url_regex = kwargs.pop('url_regex', None)
        self.provider = kwargs.pop('provider', 'self')
        self.iterator = kwargs.pop('iterator', 'iternodes')
        # Read the 'itertag' keyword here. Popping 'iterator' a second time
        # would always yield the default, because that key has already been
        # removed from kwargs on the line above.
        self.itertag = kwargs.pop('itertag', 'item')
        self.allowed_domains = domains
        self.start_urls = urls
        super(FeedSpider, self).__init__(*args, **kwargs)

sp_douban.py 文件源码项目：ScrapyProject 作者: chinesehuazhou 项目源码文件源码阅读 35 收藏 0 点赞 0 评论 0

def parse(self, response):
        """Extract the movie fields from one listing page into a single item.

        Each populated field of the yielded `DoubanTopMoviesItem` is a list
        with one entry per XPath match on the page: `title_ch`,
        `rating_num`, `rating_count`, `image_urls` and `topid`.

        Raises IndexError if a rating-count text contains no digits.
        """
        item = DoubanTopMoviesItem()
        item['title_ch'] = response.xpath('//div[@class="hd"]//span[@class="title"][1]/text()').extract()

        # NOTE(review): the original explanation here was lost to an encoding
        # error. The fields below are disabled; presumably the second title
        # span and the "other" span are not present for every entry, which
        # would leave these lists out of step with the others -- confirm
        # before re-enabling.
        # en_list = response.xpath('//div[@class="hd"]//span[@class="title"][2]/text()').extract()
        # item['title_en'] = [en.replace('\xa0/\xa0','').replace('  ','') for en in en_list]
        # ht_list = response.xpath('//div[@class="hd"]//span[@class="other"]/text()').extract()
        # item['title_ht'] = [ht.replace('\xa0/\xa0','').replace('  ','') for ht in ht_list]
        # detail_list = response.xpath('//div[@class="bd"]/p[1]/text()').extract()
        # item['detail'] = [detail.replace('  ', '').replace('\xa0', '').replace('\n', '') for detail in detail_list]
        # NOTE(review): 'quote' is disabled as well (original reason garbled).
        # item['quote'] = response.xpath('//span[@class="inq"]/text()').extract()

        item['rating_num'] = response.xpath('//div[@class="star"]/span[2]/text()').extract()
        # Keep only the first run of digits from each rating-count text.
        count_list = response.xpath('//div[@class="star"]/span[4]/text()').extract()
        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        item['rating_count'] = [re.findall(r'\d+', count)[0] for count in count_list]
        item['image_urls'] = response.xpath('//div[@class="pic"]/a/img/@src').extract()
        item['topid'] = response.xpath('//div[@class="pic"]/em/text()').extract()

        yield item

        # Disabled: follow the page's <link rel="next"> to the next page.
        # new_url = response.xpath('//link[@rel="next"]/@href').extract_first()
        # if new_url:
        #     next_url = self.base_url+new_url
        #     yield scrapy.Request(next_url, callback=self.parse)


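######------- Alternative sketch: start_urls + LinkExtractor (original heading garbled) --------#####
    # from scrapy.spiders import CrawlSpider, Rule
    # from scrapy.linkextractors import LinkExtractor
    # class SpDoubanSpider(CrawlSpider):
        # ... (original text garbled; presumably the rest of the class is omitted)
    # The rule below follows links matching the top250 "?start=N" pattern and
    # passes each response to parse_item (original comment garbled).
    # rules = [Rule(LinkExtractor(allow=(r'https://movie.douban.com/top250\?start=\d+.*')),
    #                callback='parse_item', follow=True)
    #           ]
    # def parse_item(self, response):
    #     # item ... (original text garbled; presumably item population is omitted)
    #     yield item
######------- Alternative sketch: start_urls + LinkExtractor (original heading garbled) --------#####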