def generate_article_url(self, response):
    """Re-schedule the Toutiao feed request and emit article-page requests.

    The method first yields a new request to the feed API (with itself as
    the callback, so the feed keeps being polled), then parses ``response``
    as a JSON feed payload and yields one request per matching article.

    Args:
        response: Response whose ``body`` is parsed as the feed JSON.

    Yields:
        scrapy.Request: the follow-up feed request, then one request per
        feed entry whose ``article_genre`` is ``'article'`` and whose
        ``tag_url`` is ``'news_tech'`` or ``'news_finance'``; those are
        handled by ``self.generate_article_content``.
    """
    # Two independent 15-character tokens; random.sample draws without
    # replacement, so characters within one token never repeat.
    alphabet = string.ascii_letters + string.digits
    as_id = ''.join(random.sample(alphabet, 15))
    cp_id = ''.join(random.sample(alphabet, 15))

    # One timestamp for both time-based parameters. Every parameter is
    # separated by '&' so that max_behot_time, max_behot_time_tmp and
    # tadrequire are not fused into a single value.
    now = time.time()
    feed_url = (
        "http://www.toutiao.com/api/pc/feed/?category=news_tech"
        "&utm_source=toutiao&widen=1&max_behot_time=0"
        "&max_behot_time_tmp=" + str(int(now)) +
        "&tadrequire=true&as=" + as_id + "&cp=" + cp_id + "&t=" + str(now)
    )
    # Scheduled before the payload is inspected, so polling continues even
    # when this particular response is not a successful feed page.
    yield scrapy.Request(feed_url, callback=self.generate_article_url)

    article_list = json.loads(response.body)
    if article_list.get("message") != "success":
        return

    # 'data' may be missing or null; treat that as an empty page.
    for article_detail in article_list.get('data') or []:
        # Only plain articles are followed (other genres -- presumably
        # wenda / gallery / ad entries -- are skipped).
        if article_detail.get('article_genre') != 'article':
            continue
        if article_detail.get('tag_url') not in ('news_tech', 'news_finance'):
            continue
        source_url = article_detail.get('source_url')
        if not source_url:
            # Without a path there is nothing to append to the site prefix.
            continue
        yield scrapy.Request(
            self.toutiao_url_pre + source_url,
            callback=self.generate_article_content
        )
评论列表
文章目录