def parse_newest(self, response):
    """Parse one page of the 'newest' news list and yield follow-up requests.

    For each ``<li>`` entry, extract title / url / abstract / picture / topic /
    date, wrap them in a ``NewsItem``, and pass it through ``judge_news_crawl``.
    Accepted items yield a ``scrapy.Request`` to ``parse_news`` (item attached
    via ``request.meta``); a rejected item marks the crawl boundary by storing
    the current page number in ``self.flag``. While ``self.flag`` is unset, a
    ``FormRequest`` for the next page is yielded to continue pagination.

    :param response: scrapy Response whose POST body looks like ``page=<n>``.
    :returns: generator of ``scrapy.Request`` / ``scrapy.FormRequest``.
    """
    soup = BeautifulSoup(response.body, "lxml")
    # Recover the page number from the request's form body ("page=<n>").
    page = response.request.body.split('=')[-1]
    news_list = soup.find_all('li')
    if not news_list:
        logger.info("can't find news list")
    for news in news_list:
        time_tag = news.find(class_="time")
        # Leading 2 chars of the time string are a label prefix; strip them.
        news_date = time_tag.string[2:] if time_tag else None
        # BUGFIX: the original called strptime unconditionally, raising
        # TypeError whenever the entry had no ".time" element.
        if news_date:
            struct_date = datetime.datetime.strptime(news_date, "%Y-%m-%d %H:%M")
            news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")

        title_tag = news.find(class_="title")
        title = title_tag.string if title_tag else None
        # BUGFIX: guard the missing-<a> case (AttributeError) and use "" as
        # the href default so string concatenation cannot see None.
        if title_tag and title_tag.a:
            news_url = self.domain + title_tag.a.get("href", "")
        else:
            news_url = None

        info_tag = news.find(class_="info")
        abstract = info_tag.string if info_tag else None
        img_tag = news.find('img')
        # Same None-concat guard as for news_url.
        pic = self.domain + img_tag.get('src', '') if img_tag else None
        type_tag = news.find(class_="type")
        topic = type_tag.string if type_tag else None

        # NOTE(review): the catalogue label looks mojibake-damaged ("????");
        # recover the original (likely Chinese) category string if possible.
        item = NewsItem(catalogue=u"????",
                        title=title,
                        news_url=news_url,
                        abstract=abstract,
                        pic=pic,
                        topic=topic,
                        news_date=news_date)
        item = judge_news_crawl(item)
        if item:
            # BUGFIX: skip entries without a resolvable url instead of
            # handing None to scrapy.Request (which raises ValueError).
            if news_url:
                request = scrapy.Request(news_url, callback=self.parse_news,
                                         dont_filter=True)
                request.meta["item"] = item
                yield request
        else:
            # judge_news_crawl rejected the item: record the boundary page.
            self.flag = page

    # Keep paging until the crawl boundary (self.flag) has been reached.
    if not self.flag:
        yield scrapy.FormRequest(self.start_url,
                                 formdata={'page': str(int(page) + 1)},
                                 callback=self.parse_newest)
# Stray page-scrape residue, not code (kept as comments so the file stays importable):
# 评论列表  (comment list)
# 文章目录  (article table of contents)