def generate_forum(self, response):
    """Parse a forum listing page and schedule the follow-up requests.

    This is a generator callback. It yields ``scrapy.Request`` objects in
    three groups, in this order:

    1. One request per sub-forum link found under ``td.fl_g``, handled by
       this same callback with ``page_key`` set to 1.
    2. At most one request for the pager's "next" link, handled by this
       same callback with ``page_key`` set to -1.
    3. One request per thread link (``a.xst``), handled by
       ``self.generate_forum_thread``.

    Every request carries a proxy obtained from
    ``MongoClient.get_random_proxy()`` in its ``meta``; the proxy is looked
    up separately for each request.

    Args:
        response: Scrapy response for a forum page. ``response.meta`` must
            contain a ``page_key`` entry that ``int()`` accepts.

    Yields:
        scrapy.Request: The follow-up requests described above.

    Raises:
        KeyError: While iterating, if ``response.meta`` has no ``page_key``.
    """
    # Sub-forums: an href that does not mention the site host is treated as
    # site-relative and gets the site root prepended.
    forum_links = response.xpath(
        '//td[@class="fl_g"]//dl//dt//a/@href'
    ).extract()
    for forum_url in forum_links:
        if 'bbs.zhiyoo.com' in forum_url:
            f_url = forum_url
        else:
            f_url = 'http://bbs.zhiyoo.com/' + forum_url
        yield scrapy.Request(
            f_url,
            meta={"page_key": 1, "proxy": MongoClient.get_random_proxy()},
            callback=self.generate_forum,
        )

    # Pagination: look for the pager's "next" link on this page.
    pg_bar = response.xpath(
        '//div[@class="pg"]//a[@class="nxt"]/@href'
    ).extract()
    page_key = int(response.meta['page_key'])
    # Raw markup of the per-row "by" links; handed as-is to check_rep_date.
    # NOTE(review): presumably these carry the last-reply timestamps --
    # confirm against check_rep_date.
    rep_time_list = response.xpath('//tr/td[@class="by"]/em/a').extract()

    # A page reached with page_key == 1 always follows its "next" link;
    # any other page follows it only when check_rep_date() approves.
    if pg_bar and (page_key == 1 or self.check_rep_date(rep_time_list)):
        yield scrapy.Request(
            # The href may be relative; scrapy.Request rejects URLs without
            # a scheme, so resolve it against the current page first.
            response.urljoin(pg_bar[0]),
            meta={"page_key": -1, "proxy": MongoClient.get_random_proxy()},
            callback=self.generate_forum,
        )

    # Threads: schedule every thread link listed on this page.
    for thread_url in response.xpath('//a[@class="xst"]/@href').extract():
        yield scrapy.Request(
            # Same reason as above: make relative hrefs absolute.
            response.urljoin(thread_url),
            meta={"proxy": MongoClient.get_random_proxy()},
            callback=self.generate_forum_thread,
        )
评论列表
文章目录