Example source code for Python's HtmlXPathSelector() class
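The examples below are collected from real projects. Note that HtmlXPathSelector comes from early Scrapy releases; modern Scrapy deprecates it in favor of the unified Selector class (and the response.xpath()/response.css() shortcuts), with .select() renamed to .xpath(). A minimal sketch of the old form next to its modern equivalent:

# Old API (early Scrapy): HtmlXPathSelector with .select()
#   from scrapy.selector import HtmlXPathSelector
#   hxs = HtmlXPathSelector(response)
#   titles = hxs.select('//title/text()').extract()

# Modern equivalent (Scrapy 1.0+): the unified Selector class
from scrapy.selector import Selector

def get_titles(response):
    sel = Selector(response)
    return sel.xpath('//title/text()').extract()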

quotes.py source code (project: base_function, author: Rockyzsu)
def parse(self, response):
        # print(response.url.split('/'))
        # The old HtmlXPathSelector is unnecessary here: response.xpath() works directly.
        # sel = HtmlXPathSelector(response)

        # Extract the text of every quote on the page.
        content = response.xpath('//div[@class="quote"]')
        for x in content:
            word = x.xpath('.//span[@class="text"]/text()').extract_first()
            print(word)
            yield {'text': word}

        # Follow the "next" pagination link, if any.
        nextPage = response.css('li.next a::attr(href)').extract_first()
        if nextPage is not None:
            goNext = response.urljoin(nextPage)
            print("Go next:", goNext)
            yield scrapy.Request(url=goNext, callback=self.parse)
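The method above is only the callback; a minimal complete spider around it might look like the sketch below. The spider name and start URL are assumptions (the selectors match the quotes.toscrape.com tutorial site):

import scrapy

class QuotesSpider(scrapy.Spider):
    name = 'quotes'                               # assumed spider name
    start_urls = ['http://quotes.toscrape.com/']  # assumed start URL

    def parse(self, response):
        # Yield the text of each quote on the page.
        for quote in response.xpath('//div[@class="quote"]'):
            yield {'text': quote.xpath('.//span[@class="text"]/text()').extract_first()}
        # Follow pagination until there is no "next" link.
        next_page = response.css('li.next a::attr(href)').extract_first()
        if next_page is not None:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)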
LinkedinSpider.py source code (project: spiders, author: poodarchu)
def parse(self, response):
        """Default parse callback; the crawl rules are not used for now."""
        # import pdb; pdb.set_trace()
        # Normalize the URL by stripping its query parameters before parsing.
        response = response.replace(url=HtmlParser.remove_url_parameter(response.url))
        hxs = HtmlXPathSelector(response)
        index_level = self.determine_level(response)
        log.msg("Parse: index level:" + str(index_level))
        if index_level in [1, 2, 3, 4]:
            # Index pages: save them and follow their links.
            self.save_to_file_system(index_level, response)
            relative_urls = self.get_follow_links(index_level, hxs)
            if relative_urls is not None:
                for url in relative_urls:
                    log.msg('yield process, url:' + url)
                    yield Request(url, callback=self.parse)
        elif index_level == 5:
            # Level 5 is a person's profile page: extract the profile itself.
            personProfile = HtmlParser.extract_person_profile(hxs)
            linkedin_id = self.get_linkedin_id(response.url)
            linkedin_id = UnicodeDammit(urllib.unquote_plus(linkedin_id)).markup
            if linkedin_id:
                personProfile['_id'] = linkedin_id
                personProfile['url'] = UnicodeDammit(response.url).markup
                yield personProfile
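HtmlParser.remove_url_parameter is a project-specific helper that is not shown here; judging from how it is called, it strips the query string from a URL. A hypothetical sketch of such a helper (the name and behavior are assumptions):

# Hypothetical helper: drop the query string and fragment from a URL,
# mirroring how HtmlParser.remove_url_parameter is used above.
from urllib.parse import urlsplit, urlunsplit

def remove_url_parameter(url):
    parts = urlsplit(url)
    # Keep scheme, host, and path; discard query and fragment.
    return urlunsplit((parts.scheme, parts.netloc, parts.path, '', ''))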
chou.py source code (project: Spider, author: Ctrlsman)
def show(self, response):
        import hashlib

        hxs = HtmlXPathSelector(response)

        # POST an upvote for every item on the current page.
        news_list = hxs.select('//div[@id="content-list"]/div[@class="item"]')
        for new in news_list:
            link_id = new.xpath('*/div[@class="part2"]/@share-linkid').extract_first()
            yield Request(
                url='http://dig.chouti.com/link/vote?linksId=%s' % (link_id,),
                method='POST',
                cookies=self.cookie_dict,
                callback=self.do_favor
            )

        # Follow pagination links, deduplicating by an MD5 hash of each URL.
        page_list = hxs.select('//div[@id="dig_lcpage"]//a[re:test(@href, "/all/hot/recent/\d+")]/@href').extract()
        for page in page_list:
            page_url = 'http://dig.chouti.com%s' % page
            key = hashlib.md5(page_url.encode('utf-8')).hexdigest()
            if key not in self.has_request_set:
                self.has_request_set[key] = page_url
                yield Request(
                    url=page_url,
                    method='GET',
                    callback=self.show
                )
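The manual MD5 bookkeeping above reimplements something Scrapy already provides: by default the scheduler fingerprints every request and drops duplicates unless dont_filter=True is set. A minimal sketch of the same pagination loop relying on the built-in duplicate filter:

# Sketch: rely on Scrapy's built-in duplicate filter instead of manual MD5 keys.
def show(self, response):
    hxs = HtmlXPathSelector(response)
    page_list = hxs.select('//div[@id="dig_lcpage"]//a/@href').extract()
    for page in page_list:
        # The scheduler fingerprints each request and silently drops duplicates.
        yield Request(url='http://dig.chouti.com%s' % page, method='GET', callback=self.show)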
basic_authentication_spider.py source code (project: Hanhan_NLP, author: hanhanwu)
def parse_responsepage(self, response):
        hxs = HtmlXPathSelector(response)
        # Grab all <span> elements from the authenticated page.
        yum = hxs.select('//span')
        print(yum)
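The snippet only shows the response handler. The basic-auth part of such a spider is typically handled by Scrapy's built-in HttpAuthMiddleware, which reads http_user and http_pass attributes from the spider. A minimal sketch (the spider name, URL, and credentials are placeholders):

import scrapy

class BasicAuthSpider(scrapy.Spider):
    name = 'basic_auth_example'           # placeholder name
    start_urls = ['http://example.com/']  # placeholder URL
    # HttpAuthMiddleware picks these up and sends an Authorization header.
    http_user = 'someuser'
    http_pass = 'somepass'

    def parse(self, response):
        # Same extraction as above, on the authenticated page.
        for span in response.xpath('//span'):
            print(span.extract())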
Btc_Spiders.py source code (project: Python, author: wanghaoying)
def parse(self, response):
        url = response.url
        # url = 'https://data.btcchina.com/data/ticker?market=all'

        # The response body is JSON, not HTML, so no HtmlXPathSelector is needed.
        # hxs = HtmlXPathSelector(response)
        data = json.loads(response.body_as_unicode())

        item = BTC.items.BtcItem()

        item['time'] = self._get_sys_time()
        item['now'] = data['ticker_btccny']['buy']
        item['height'] = data['ticker_btccny']['high']
        item['low'] = data['ticker_btccny']['low']
        yield item
        # Re-request the same URL to keep polling the ticker; without
        # dont_filter=True the duplicate filter would drop this request.
        yield Request(url, dont_filter=True)
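On newer Scrapy versions (2.2+), response.json() replaces the json.loads(response.body_as_unicode()) pattern; a minimal sketch of the same extraction:

# Sketch for Scrapy >= 2.2: parse the JSON body directly.
def parse(self, response):
    data = response.json()
    yield {
        'buy': data['ticker_btccny']['buy'],
        'high': data['ticker_btccny']['high'],
        'low': data['ticker_btccny']['low'],
    }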
Spiders.py source code (project: Python, author: wanghaoying)
def parse(self, response):
        self.log("fetch group home page: %s" % response.url)

        hxs = HtmlXPathSelector(response)

        item = douban_group.items.DoubanGroupItem()

        # Group name, stripped of surrounding whitespace by the regex.
        item['groupName'] = hxs.select('//*[@id="group-info"]/h1/text()').re('^\s+(.*)\s+$')[0]
        item['groupUrl'] = response.url

        group_id = self.__get_id_from_group_url(response.url)

        # The member count appears in the text of the link to the members page.
        member_url = 'https://www.douban.com/group/%s/members' % group_id
        member_text = hxs.select('//a[contains(@href,"%s")]/text()' % member_url).re('(\d+)')
        item['totalNumber'] = member_text[0]

        # Follow links to related groups.
        groups = hxs.select('//div[contains(@class,"group-list-item")]')
        for group in groups:
            url = group.select('div[contains(@class,"title")]/a/@href').extract()[0]
            yield Request(url)

        # Note: time.sleep() blocks Scrapy's event loop; DOWNLOAD_DELAY is the
        # idiomatic throttle (see the sketch below).
        time.sleep(1)
        yield item
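Because Scrapy runs on a single-threaded event loop, the time.sleep(1) above stalls every in-flight request, not just this one. The idiomatic throttle is the DOWNLOAD_DELAY setting; a minimal sketch (the class name is a placeholder):

import scrapy

# Sketch: throttle requests via settings instead of blocking the reactor.
class DoubanGroupSpider(scrapy.Spider):  # placeholder class name
    name = 'douban_group_example'
    custom_settings = {
        'DOWNLOAD_DELAY': 1,  # wait one second between requests
    }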
crawlerBlog.py source code (project: pydata_webscraping, author: jmortega)
def parse_blog(self, response):
        print('parsed link: %s' % response.url)
        hxs = HtmlXPathSelector(response)
        item = HackerWayItem()
        item['title'] = hxs.select('//title/text()').extract()  # XPath selector for the title
        item['author'] = hxs.select("//span[@class='author']/a/text()").extract()  # XPath selector for the author
        # Meta tags hold their value in @content, not in text(), so select the attribute.
        item['tag'] = hxs.select("//meta[@property='og:title']/@content").extract()  # XPath selector for the tag
        item['date'] = hxs.select("//span[@class='date']/text()").extract()  # XPath selector for the date
        return item  # Return the populated item.
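Each .extract() call above returns a list of all matches. When a single value is wanted, .extract_first() (or .get() on the modern Selector API) returns the first match or None; a minimal sketch:

# Sketch: single-value extraction with the modern Selector API.
def parse_blog(self, response):
    title = response.xpath('//title/text()').get()    # first match, or None
    author = response.xpath("//span[@class='author']/a/text()").get()
    return {'title': title, 'author': author}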
base.py source code (project: ahmia-crawler, author: ahmia)
def parse_item(self, response):
        """ Parse a response into a DocumentItem. """
        doc_loader = ItemLoader(item=DocumentItem(), response=response)
        doc_loader.add_value('url', response.url)
        doc_loader.add_xpath('meta', '//meta[@name=\'description\']/@content')
        doc_loader.add_value('domain', urlparse(response.url).hostname)
        doc_loader.add_xpath('title', '//title/text()')

        hxs = HtmlXPathSelector(response) # For HTML extractions

        # Extract every link on the page, recording its URL and anchor text
        links = []
        for link in hxs.xpath('//a'):
            link_obj = {}
            # The link's target URL
            link_str = " ".join(link.xpath('@href').extract())
            link_obj['link'] = link_str.replace("\n", "")
            # The link's anchor text, whitespace-normalized
            link_name_str = " ".join(link.xpath('text()').extract())
            link_obj['link_name'] = link_name_str.replace("\n", "").strip()
            links.append(link_obj)
        doc_loader.add_value('links', links)

        # Populate text field
        title_list = hxs.xpath('//title/text()').extract()
        title = ' '.join(title_list)
        body_text = self.html2string(response)
        text = title + " " + body_text
        doc_loader.add_value('content', text)
        doc_loader.add_value('raw_text', text)

        doc_loader.add_value('raw_title', title)
        doc_loader.add_value('raw_url', response.url)

        h1_list = hxs.xpath("//h1/text()").extract()
        doc_loader.add_value('h1', " ".join(h1_list))

        doc_loader.add_value('content_type', response.headers['Content-type'])
        doc_loader.add_value('updated_on', datetime.datetime.now().strftime(
            "%Y-%m-%dT%H:%M:%S"))
        item = doc_loader.load_item()
        return item
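Scrapy also ships a LinkExtractor that covers much of this manual anchor walking; a minimal sketch producing the same link/link_name structure:

# Sketch: collect links with Scrapy's built-in LinkExtractor instead of
# iterating over //a nodes by hand.
from scrapy.linkextractors import LinkExtractor

def extract_links(response):
    links = []
    for link in LinkExtractor().extract_links(response):
        links.append({'link': link.url, 'link_name': link.text.strip()})
    return links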

