class_central_spider.py 文件源码

python
阅读 32 收藏 0 点赞 0 评论 0

项目:CourseWebCrawler 作者: BitTigerInst 项目源码 文件源码
def parse(self, response):
    """Parse one page of the Class Central "past courses" listing.

    Responses flagged with ``meta['is_json']`` are decoded as JSON and the
    HTML fragment under the ``'table'`` key is parsed; any other response
    is parsed directly as HTML.

    On the first call (while ``self.flag`` is true) the total course count
    is read from the "show more courses" element and used to derive
    ``self.total_item_num`` and ``self.steps``.

    Yields:
        One ``scrapy.Request`` per course row, targeting the course detail
        page (callback ``self.parse_detail_page``) with the partially
        filled item dict in ``meta['item']``; then, while
        ``self.cnt < self.steps``, one request for the next listing page
        (callback ``self.parse``).

    Rows without a link containing ``/mooc/<id>/`` are skipped, since no
    detail request or course id can be built for them.
    """
    # Get help from:  http://stackoverflow.com/questions/38574869/how-can-i-jump-to-next-page-in-scrapy
    if response.meta.get('is_json', False):
        page = Selector(text=json.loads(response.body)['table'])
    else:
        page = Selector(response)

    if self.flag:
        total_text = page.xpath('//div[@id="show-more-courses"]/text()').re(r'courses of (.*)')[0]
        self.total_item_num = int(total_text) + 50
        # Single pre-formatted argument keeps the output identical on Python 2 and 3.
        print("Total courses:  %s" % self.total_item_num)
        # Floor division: the step count must stay an int on Python 3 as well.
        self.steps = self.total_item_num // 50 + 1
        self.flag = False

    my_header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'}

    rows = page.xpath('//tr[@itemtype="http://schema.org/Event"]')
    print("Process:  %s / %s" % (self.cnt, self.steps))

    for row in rows:
        # extract_first() returns None when the node is missing, so fall back
        # to "" before stripping instead of raising AttributeError.
        href = (row.xpath('./td/a/@href').extract_first() or "").strip()
        cid_match = re.search(r'/mooc/(.*)/', href)
        if cid_match is None:
            # No usable course link: nothing to request and no id to assign.
            continue

        item = MoocCrawlerItem()
        # NOTE(review): if MoocCrawlerItem is a scrapy.Item, keys() of a fresh
        # instance lists only populated fields, so this may yield an empty
        # dict; `item.fields` may be what was intended -- confirm.
        item = {k: "" for k in item.keys()}

        item['name'] = (row.xpath('./td/a/span[@class="course-name-text"]/text()').extract_first() or "").strip()

        # Only the first three characters of the rating text are used.
        score_text = (row.xpath('./td/div[@class="course-rating-value"]/text()').extract_first() or "").strip()[:3]
        try:
            item['score'] = float(score_text) * 2
        except ValueError:
            # Missing or non-numeric rating: keep the empty-string placeholder.
            item['score'] = ""

        item['platform'] = (row.xpath('./td/div[@class="course-provider"]/text()').extract_first() or "").strip()
        item['url'] = "https://www.class-central.com" + href
        item['cid'] = "cc" + cid_match.group(1)

        detail_request = scrapy.Request(item['url'], headers=my_header, callback=self.parse_detail_page)
        detail_request.meta['item'] = item

        yield detail_request

    if self.cnt < self.steps:
        next_page = response.meta.get('page', 1) + 1
        next_page_url = add_or_replace_parameter(
            "https://www.class-central.com/maestro/courses/past?page=1&_=1471346096733",
            'page',
            next_page,
        )
        next_request = scrapy.Request(
            next_page_url,
            headers=my_header,
            callback=self.parse,
            meta={'page': next_page, 'is_json': True},
        )
        self.cnt += 1
        yield next_request
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号