def parse(self, response):
    """Parse one news-list page.

    Yields a ``scrapy.Request`` (callback ``self.parse_news``) for each
    article that ``judge_news_crawl`` accepts and, as long as ``self.flag``
    is still falsy, a request for the next list page.

    :param response: list-page response; ``response.url`` carries the page
        index as a ``_<n>.`` suffix (e.g. ``.../transport_02.html``),
        ``response.body`` is the HTML to parse.
    """
    origin_url = response.url
    # e.g. http://money.163.com/special/002526O5/transport_02.html
    # Capture the FULL page index. The original pattern `_(\d)*?\.` repeated
    # a one-character group, so group(1) held only the last digit ("_12." -> "2").
    search_result = re.search(r"_(\d+)\.", origin_url)
    # URLs without an index suffix are page 1.
    pageindex = search_result.group(1) if search_result else 1
    soup = BeautifulSoup(response.body, "lxml")
    news_list = soup("div", class_="list_item clearfix")
    for news in news_list:
        # Hoist each find() so every element is searched once, not twice.
        time_tag = news.find("span", class_="time")
        news_date = time_tag.text if time_tag else None
        headline = news.find("h2")
        title = headline.text if headline else None
        news_url = headline.a.get("href", None) if headline else None
        paragraph = news.find("p")
        abstract = paragraph.contents[0] if paragraph else None
        item = NewsItem(title=title, news_url=news_url,
                        abstract=abstract, news_date=news_date)
        # judge_news_crawl returns the item if it is still inside the crawl
        # window, falsy otherwise -- presumably a date cutoff; TODO confirm.
        item = judge_news_crawl(item)
        if item:
            request = scrapy.Request(news_url, callback=self.parse_news,
                                     meta={"item": item})
            yield request
        else:
            # A rejected item means later pages are out of range: record the
            # page we stopped on so pagination halts below.
            self.flag = int(pageindex)
    if not self.flag:
        # BUG FIX: the original `self.next_url % int(pageindex)+1` parsed as
        # `(next_url % pageindex) + 1` (% binds tighter than +), raising
        # TypeError by adding an int to the formatted string. Parenthesize
        # so the *incremented* index is substituted into the URL template.
        next_url = self.next_url % (int(pageindex) + 1)
        yield scrapy.Request(next_url)
# 评论列表 (comment list)  -- stray page text accidentally pasted into the file;
# 文章目录 (article table of contents)  -- commented out so the module parses.