tech_ifeng_spider.py 文件源码-python代码片段

tech_ifeng_spider.py 文件源码

python

阅读 21 收藏 0 点赞 0 评论 0

项目：NewsScrapy 作者: yinzishao 项目源码文件源码

def parse_news(self, response):
        """Fill in the article-detail fields of a news item and yield it.

        The item is taken from ``response.meta["item"]`` when present,
        otherwise a fresh ``NewsItem`` is used. The following keys are set:

        * ``pic``         -- ``src`` of the ``<img>`` inside ``<p class="detailPic">``
        * ``referer_web`` -- text of ``<span class="ss03">``
        * ``author``      -- text of the ``<span>`` nested in
          ``<span itemprop="author">``
        * ``content``     -- stripped text of every ``<p>`` under
          ``<div id="main_content">``, joined with blank lines
        * ``crawl_date``  -- the module-level ``NOW`` value

        Any field whose markup is missing (including a wrapper element that
        lacks the expected child) is set to ``None`` instead of raising.

        :param response: response object exposing ``body`` (UTF-8 bytes) and
            ``meta`` (dict-like); presumably a Scrapy response -- confirm
            against the caller.
        :raises UnicodeDecodeError: if ``response.body`` is not valid UTF-8.
        """
        item = response.meta.get("item", NewsItem())
        # Decode once and hand text to BeautifulSoup; re-encoding to bytes
        # only makes the parser guess the encoding again.
        soup = BeautifulSoup(response.body.decode("utf-8"), "lxml")

        # Each lookup is done once and checked step by step, so a wrapper
        # element without the expected child yields None rather than an
        # AttributeError on a chained ``.find(...)``.
        pic_holder = soup.find("p", class_="detailPic")
        pic_tag = pic_holder.find("img") if pic_holder is not None else None
        pic = pic_tag.get("src") if pic_tag is not None else None

        source_tag = soup.find("span", class_="ss03")
        referer_web = source_tag.text if source_tag is not None else None

        author_holder = soup.find("span", itemprop="author")
        author_tag = author_holder.find("span") if author_holder is not None else None
        author = author_tag.text if author_tag is not None else None

        body_holder = soup.find("div", id="main_content")
        paragraphs = body_holder.find_all("p") if body_holder is not None else []
        # An empty paragraph list used to become None and then be iterated,
        # raising TypeError; treat it the same as a missing container.
        if paragraphs:
            content = "\n\n".join([p.text.strip() for p in paragraphs])
        else:
            content = None

        item['pic'] = pic
        item['referer_web'] = referer_web
        item['author'] = author
        item['content'] = content
        item['crawl_date'] = NOW
        yield item