def parse_news(self, response):
    """Parse an article page, fill in the news item and yield it.

    The partially built item and the listing page index are read from
    ``response.meta`` (keys ``"item"`` and ``"pageindex"``); when absent,
    a fresh ``NewsItem`` and page index ``1`` are used.

    Args:
        response: The response for the article page. Its ``body`` is
            parsed as HTML and its ``meta`` dict carries the item.

    Yields:
        The populated item, when ``judge_news_crawl`` returns a truthy
        value for it.

    Raises:
        AttributeError: If the page has no ``<td class="time">`` element.
        ValueError: If the date text is not in ``%Y-%m-%d %H:%M`` format.
    """
    item = response.meta.get("item", NewsItem())
    pageindex = response.meta.get("pageindex", 1)
    soup = BeautifulSoup(response.body, 'lxml')

    # The page shows minute precision; re-format with seconds ("...:00").
    origin_date = soup.find("td", class_="time").text.strip()
    struct_date = datetime.datetime.strptime(origin_date, "%Y-%m-%d %H:%M")
    news_date = struct_date.strftime("%Y-%m-%d %H:%M:%S")

    # Look the article container up once. A missing container and an
    # empty one both mean "no content" and are stored as None, instead of
    # raising AttributeError on a missing node.
    content_node = soup.find("div", class_="lph-article-comView")
    content_text = content_node.text.strip() if content_node is not None else ""
    content = content_text or None

    item["news_date"] = news_date
    item["crawl_date"] = NOW
    item["content"] = content
    # NOTE(review): this literal looks like mis-encoded text; confirm the
    # intended catalogue name before changing it.
    item["catalogue"] = u"????"

    item = judge_news_crawl(item)
    if item:
        yield item
    else:
        # Record the page index at which an item was rejected.
        # Presumably read elsewhere to stop paging -- TODO confirm.
        self.flag = int(pageindex)
评论列表
文章目录