def parse(self, response):
    """Collect news links from a listing page and schedule each one for parsing.

    The response body is parsed with BeautifulSoup (lxml backend). Inside the
    ``div`` whose id is ``ess_ctr10789_ModuleContent``, every ``<a>`` whose
    ``href`` matches the toptour.cn ``/tab`` URL pattern is turned into a
    ``NewsItem`` (URL, stripped link text as title, fixed catalogue label) and
    a follow-up request handled by ``self.parse_news``.

    Args:
        response: The downloaded listing page; ``response.url`` and
            ``response.body`` are read.

    Yields:
        scrapy.Request: One request per matching link, carrying the partially
        filled item in ``meta['item']``. Nothing is yielded (and a warning is
        logged) when the container ``div`` is missing.
    """
    origin_url = response.url
    soup = BeautifulSoup(response.body, "lxml")

    # Look the container up once; ``find`` already returns None when absent.
    container = soup.find("div", id="ess_ctr10789_ModuleContent")
    if not container:
        # Lazy logger arguments: the message is only formatted if emitted.
        logger.warning("%s can't find news_list ", origin_url)
        return

    # Dots are escaped so they match a literal '.' rather than any character.
    link_pattern = re.compile(r"http://www\.toptour\.cn/tab")
    for link in container.find_all("a", href=link_pattern):
        item = NewsItem(
            news_url=link.get("href"),
            title=link.text.strip(),
            # NOTE(review): "???" looks like a label whose original characters
            # were lost to an encoding problem -- confirm the intended value.
            catalogue=u"???",
        )
        yield scrapy.Request(
            item["news_url"],
            callback=self.parse_news,
            meta={"item": item},
        )
评论列表
文章目录