def parse(self, response):
    """Turn a listing response into article requests, or hand off a topic page.

    If the response URL does not contain ``"index"``, the page is treated as
    a news listing: for every ``<li>`` inside ``div.lie_main_m`` a
    ``NewsItem`` (URL, title, number, catalogue) is built and a request for
    the article is yielded with ``self.parse_news`` as callback, carrying the
    item in ``meta['item']``.

    Otherwise the URL (minus its extension) is registered in ``self.flag``
    with an initial value of 0 and the same URL is requested again with
    ``self.parse_topic`` as callback.

    Args:
        response: The downloaded page; only ``url`` and ``body`` are read.

    Yields:
        scrapy.Request: One request per usable listing entry, or a single
        request for the topic page.

    Raises:
        AttributeError: If the channel link (``a.blue.CurrChnlCls``) or the
            list container (``div.lie_main_m``) is missing from a listing
            page, because ``find`` then returns ``None``.
    """
    origin_url = response.url

    if "index" not in origin_url:
        soup = BeautifulSoup(response.body, "lxml")
        catalogue = soup.find("a", class_="blue CurrChnlCls").get("title").strip()
        news_list = soup.find("div", class_="lie_main_m").find_all("li")

        for news in news_list:
            # Look the anchor up once; a decorative <li> without a link must
            # not abort the generator and lose every entry that follows it.
            anchor = news.find("a")
            href = anchor.get("href") if anchor is not None else None
            if not href:
                continue

            title = anchor.text.strip()
            # The first two characters of the href are dropped before it is
            # appended to the section root (presumably a leading "./" --
            # TODO confirm against the live markup).
            news_url = "http://www.cnta.gov.cn/xxfb" + href[2:]
            # Article number = last path segment up to its first dot.
            news_no = news_url.rsplit("/", 1)[-1].split(".")[0]

            item = NewsItem(
                news_url=news_url,
                title=title,
                news_no=news_no,
                catalogue=catalogue,
            )
            yield scrapy.Request(
                item["news_url"],
                callback=self.parse_news,
                meta={'item': item},
            )
    else:
        # Key the per-topic entry by the URL without its extension.
        topic_url = origin_url.rsplit(".", 1)[0]
        self.flag.setdefault(topic_url, 0)
        # NOTE(review): this re-requests the URL that was just fetched. If the
        # incoming request was not issued with dont_filter=True, Scrapy's
        # duplicate filter may drop it -- confirm how these URLs are scheduled.
        yield scrapy.Request(origin_url, callback=self.parse_topic)
评论列表
文章目录