meadin_spider.py 文件源码

python
阅读 23 收藏 0 点赞 0 评论 0

项目:NewsScrapy 作者: yinzishao 项目源码 文件源码
def parse_news(self, response):
    """Parse a news detail page and yield the completed item.

    Fills ``news_date``, ``content``, ``referer_web``, ``crawl_date`` and
    ``news_no`` on the item carried in ``response.meta["item"]`` (a fresh
    ``NewsItem`` is used when none was passed), then hands the item to
    ``judge_news_crawl``.

    * If ``judge_news_crawl`` returns a truthy item, that item is yielded.
    * Otherwise nothing is yielded and ``self.flag`` is set to the
      ``pageindex`` carried in ``response.meta`` (default ``1``).
    * If the page has no publication date, a warning is logged and
      nothing is yielded.
    """
    item = response.meta.get("item", NewsItem())
    pageindex = response.meta.get("pageindex", 1)
    soup = BeautifulSoup(response.body)

    # The publication date is read from <span class="arial">.
    # Some pages do not carry it, e.g.
    # http://info.meadin.com/PictureNews/2938_1.shtml
    date_tag = soup.find("span", class_="arial")
    news_date = date_tag.text if date_tag is not None else None
    if not news_date:
        logger.warning("can't find news_date.the url is %s", response.url)
        return

    # NOTE: an earlier revision parsed news_date with
    # "%Y-%m-%d %H:%M:%S" and raised CloseSpider once the article was
    # self.end_day days older than self.end_now; that check is disabled.

    # The referring site is taken as the last text node of
    # <p class="source">; guard against a tag with no text at all.
    referer_web = None
    source_tag = soup.find("p", class_="source")
    if source_tag is not None:
        source_strings = list(source_tag.strings)
        if source_strings:
            referer_web = source_strings[-1]

    # Article body: strip the intro block (when there is one) so it does
    # not end up in the extracted text.
    content = None
    art = soup.find("div", class_="article js-article")
    if art is not None:
        intro = art.find("div", class_="intro")
        if intro is not None:
            intro.replace_with("")
        content = art.text.strip()

    # The news number is the file-name part of the URL before the first
    # underscore, e.g. ".../2938_1.shtml" -> "2938".
    news_no = response.url.split("/")[-1].split("_")[0]

    item["news_date"] = news_date
    item["content"] = content
    item["referer_web"] = referer_web
    item["crawl_date"] = NOW
    item["news_no"] = news_no

    # presumably judge_news_crawl returns a falsy value for items that
    # should not be kept (e.g. outside the crawl window) - TODO confirm
    item = judge_news_crawl(item)
    if item:
        yield item
    else:
        self.flag = pageindex
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号