def parse(self, response):
    """Parse one page of the Class Central "past courses" listing.

    The first response is the regular HTML page. Follow-up responses come
    from the ``maestro`` endpoint and are JSON documents whose ``table``
    entry holds the HTML rows; they are recognised through
    ``response.meta['is_json']``.

    For every course row a request for the course detail page is yielded.
    It is handled by ``self.parse_detail_page`` and carries the partially
    filled item in ``meta['item']``. Afterwards the next listing page is
    requested for as long as ``self.cnt < self.steps``.

    Rows without a usable course link are skipped, and missing optional
    cells (name, rating, provider) fall back to ``""``, so that a single
    malformed row cannot abort the rest of the page or the pagination.

    Pagination approach based on:
    http://stackoverflow.com/questions/38574869/how-can-i-jump-to-next-page-in-scrapy
    """

    def first_text(node, query):
        # extract_first() yields None when the XPath matches nothing;
        # normalise that to an empty string instead of crashing on .strip().
        value = node.xpath(query).extract_first()
        return value.strip() if value is not None else ""

    if response.meta.get('is_json', False):
        page = Selector(text=json.loads(response.body)['table'])
    else:
        page = Selector(response)

    if self.flag:
        # Executed once (self.flag is cleared below): read the course counter
        # from the "show more" element and derive the number of pages.
        # NOTE(review): the +50 and the division by 50 presumably reflect a
        # page size of 50 rows, with the counter excluding the rows already
        # shown -- confirm against the site.
        counter = page.xpath('//div[@id="show-more-courses"]/text()').re(r'courses of (.*)')[0]
        self.total_item_num = int(counter) + 50
        print("Total courses:  %s" % self.total_item_num)
        # Floor division keeps the step count an integer on Python 2 and 3.
        self.steps = self.total_item_num // 50 + 1
        self.flag = False

    my_header = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'}
    print("Process:  %s / %s" % (self.cnt, self.steps))

    for row in page.xpath('//tr[@itemtype="http://schema.org/Event"]'):
        href = first_text(row, './td/a/@href')
        cid_match = re.search(r'/mooc/(.*)/', href)
        if cid_match is None:
            # Without a /mooc/<id>/ link there is neither a detail page to
            # fetch nor an id to store, so the row is of no use.
            print("Skip row without course url: %r" % href)
            continue

        # NOTE(review): on a freshly created scrapy Item, keys() may list only
        # the populated fields, which would make this dict empty;
        # MoocCrawlerItem.fields is likely what was intended -- confirm
        # against parse_detail_page before changing.
        item = {k: "" for k in MoocCrawlerItem().keys()}
        item['name'] = first_text(row, './td/a/span[@class="course-name-text"]/text()')

        # Only the first three characters of the rating text are used; the
        # value is then doubled (presumably 5-point -> 10-point scale).
        score_text = first_text(row, './td/div[@class="course-rating-value"]/text()')[:3]
        try:
            item['score'] = float(score_text) * 2
        except ValueError:
            # Unrated course or non-numeric text.
            item['score'] = ""

        item['platform'] = first_text(row, './td/div[@class="course-provider"]/text()')
        item['url'] = "https://www.class-central.com" + href
        item['cid'] = "cc" + cid_match.group(1)

        detail_request = scrapy.Request(item['url'], headers=my_header, callback=self.parse_detail_page)
        detail_request.meta['item'] = item
        yield detail_request

    if self.cnt < self.steps:
        next_page = response.meta.get('page', 1) + 1
        # The query value is passed as a string, which is what URL helpers expect.
        next_page_url = add_or_replace_parameter(
            "https://www.class-central.com/maestro/courses/past?page=1&_=1471346096733",
            'page',
            str(next_page),
        )
        self.cnt += 1
        yield scrapy.Request(next_page_url, headers=my_header, callback=self.parse, meta={'page': next_page, 'is_json': True})
class_central_spider.py 文件源码
python
阅读 32
收藏 0
点赞 0
评论 0
评论列表
文章目录