def get_record_list(self, response):
    """Parse one forum list page: queue its threads, then the next list page.

    Yields a ``scrapy.Request`` (callback ``self.get_record_page_num``) for
    every relative thread link (an href containing ``/p/``) found under the
    element with id ``thread_list``. Afterwards, if ``self.check_rep_date``
    accepts the first reply-date text on the page and a "next" link exists,
    yields one request for the following list page (callback
    ``self.get_record_list``).

    Args:
        response: Scrapy response for a list page. ``response.meta`` must
            carry ``category``; ``page_key`` (the current ``pn`` offset) is
            read if present and treated as 0 otherwise.

    Yields:
        ``scrapy.Request`` objects for thread pages and, at most once, for
        the next list page.
    """
    # response.body is bytes, so the markers must be bytes literals too
    # (identical to the plain-str form on Python 2, required on Python 3).
    # Stripping the comment markers exposes markup that would otherwise be
    # parsed as an HTML comment -- presumably the thread list is shipped
    # that way; confirm against a live page.
    content = response.body.replace(b'<!--', b'').replace(b'-->', b'')
    tree = etree.HTML(content)
    category = response.meta['category']

    for href in tree.xpath('//*[@id="thread_list"]//a/@href'):
        # Only site-relative thread links; anything that already carries a
        # scheme (http:// or https://) must not get the host prefixed again.
        if '/p/' not in href or '://' in href:
            continue
        yield scrapy.Request(
            'http://tieba.baidu.com' + href,
            meta={"category": category},
            callback=self.get_record_page_num,
        )

    # Decide whether to keep paginating based on the first reply date shown
    # on this page (the text is presumably a time such as "12:12" for
    # same-day replies -- confirm against check_rep_date).
    rep_time = tree.xpath(
        '//span[contains(@class,"threadlist_reply_date")]/text()'
    )
    if not rep_time or not self.check_rep_date(rep_time[0]):
        # No reply dates on the page, or the date check failed: stop here.
        return

    next_page = tree.xpath('//a[contains(@class, "next")]/text()')
    if not next_page:
        return

    # NOTE(review): this logs the "next" link text at ERROR level although
    # nothing has failed; kept as-is so existing log output is unchanged.
    logging.error(next_page[0])

    # The list-page offset advances by 50 per page.
    page_key = int(response.meta.get('page_key', 0)) + 50
    url = ('http://tieba.baidu.com/f?ie=utf-8&kw=' + category
           + '&fr=search&pn=' + str(page_key))
    yield scrapy.Request(
        url,
        meta={"page_key": page_key, "category": category},
        callback=self.get_record_list,
    )
# [web page residue, not code] Comment list
# [web page residue, not code] Table of contents