transport163.py 文件源码

python
阅读 25 收藏 0 点赞 0 评论 0

项目:NewsScrapy 作者: yinzishao 项目源码 文件源码
def parse(self, response):
        """Parse one listing page; yield article requests and the next page.

        For every ``div.list_item.clearfix`` entry a ``NewsItem`` is built
        from the entry's title, link, abstract and date, then passed through
        ``judge_news_crawl``.  Entries that come back truthy are followed
        with ``self.parse_news`` as callback (the item travels in
        ``meta["item"]``).  When an entry comes back falsy, ``self.flag`` is
        set to the current page index, which stops pagination: the next
        listing page is requested only while ``self.flag`` is falsy.

        ``self.next_url`` is used as a %-format template that receives the
        next page number.
        """
        origin_url = response.url
        # Listing URLs look like
        # http://money.163.com/special/002526O5/transport_02.html
        # Capture the whole digit run; a repeated single-digit group would
        # keep only the last digit and break on page numbers >= 10.
        search_result = re.search(r"_(\d+)\.", origin_url)
        # A URL without a numeric suffix is treated as page 1.
        pageindex = int(search_result.group(1)) if search_result else 1

        soup = BeautifulSoup(response.body, "lxml")
        for news in soup("div", class_="list_item clearfix"):
            # Look each element up once instead of once per field.
            time_tag = news.find("span", class_="time")
            heading = news.find("h2")
            paragraph = news.find("p")
            link = heading.a if heading is not None else None

            news_date = time_tag.text if time_tag is not None else None
            title = heading.text if heading is not None else None
            news_url = link.get("href", None) if link is not None else None
            # Guard against an empty <p>, whose contents list has no [0].
            if paragraph is not None and paragraph.contents:
                abstract = paragraph.contents[0]
            else:
                abstract = None

            item = NewsItem(title=title, news_url=news_url,
                            abstract=abstract, news_date=news_date)
            # NOTE(review): judge_news_crawl presumably decides whether the
            # item still needs crawling and returns a falsy value otherwise
            # -- confirm against its definition.
            item = judge_news_crawl(item)
            if not item:
                # Remember the page on which crawling should stop.
                self.flag = pageindex
            elif news_url:
                # Entries without a link cannot be followed; skip them
                # rather than let scrapy.Request fail on a None URL.
                yield scrapy.Request(news_url, callback=self.parse_news,
                                     meta={"item": item})

        if not self.flag:
            # Parenthesise the increment: `%` binds tighter than `+`, so the
            # unparenthesised form tried to add 1 to the formatted string.
            next_url = self.next_url % (pageindex + 1)
            yield scrapy.Request(next_url)
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号