Example source code for the Python Selector() class
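
Scrapy's Selector (provided by the parsel library) wraps an HTML or XML document and exposes .xpath() and .css() queries; extract() returns a list of matching strings, while extract_first() returns the first match or None. The snippets below, taken from the projects named in each heading, construct it in three ways: from a Response object, from a raw string via the text= keyword, and from an lxml tree via root=. A minimal sketch of the string form (the markup is invented for illustration):

from scrapy.selector import Selector

html = '<ul><li class="item">first</li><li class="item">second</li></ul>'  # invented sample markup

sel = Selector(text=html)                          # build from a raw HTML string
items = sel.css('li.item::text').extract()         # -> ['first', 'second']
first = sel.css('li.item::text').extract_first()   # -> 'first'

# In a spider callback the same object is usually built from the response:
#     sel = Selector(response)
# and recent Scrapy versions also expose the queries directly as
# response.css(...) / response.xpath(...).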

virtualvocations.py (project: remotor, author: jamiebull1)
def parse(self, response):
        """Get the pagination links and hand them off.
        """
        s = Selector(response)
        pagination = s.css('.pagination')
        pagelinks = [response.url]
        pagelinks.extend(pagination.xpath(
            '//a[contains(@href, "l-remote/p-")]/@href').extract())
#        for pagelink in pagelinks:
        for pagelink in pagelinks[:1]:
            request = Request(
                urljoin(self.root, pagelink),
                callback=self.parse_jobspage,
                dont_filter=True,
                )
            yield request
careerbuilder.py (project: remotor, author: jamiebull1)
def parse_job(self, response):
        """Parse a joblink into a JobItem.
        """
        s = Selector(response)
        item = JobItem()
        item['url'] = response.url.split('?')[0]
        item['site'] = 'CareerBuilder'
        item['title'] = s.css('h1::text').extract_first()
        item['text'] = s.css('.job-facts::text').extract()
        item['text'].extend(s.css('.item').css('.tag::text').extract())
        item['text'].extend(s.css('.description::text').extract())
        try:
            posted = s.xpath(
                '//h3[@id="job-begin-date"]/text()').extract_first()
            item['date_posted'] = utilities.naturaltime(
                posted.replace('Posted ', '')).isoformat()
        except Exception as e:
            self.logger.error(e)
        yield item
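
extract() always returns a list (an empty one when nothing matches), while extract_first() returns a single string or None; that is why the date handling above is wrapped in try/except, since .replace() would fail on None. A minimal sketch of the difference, using invented markup and the optional default= argument:

from scrapy.selector import Selector

sel = Selector(text='<h3 id="job-begin-date">Posted 30+ days ago</h3>')   # invented sample markup
posted = sel.xpath('//h3[@id="job-begin-date"]/text()').extract_first(default='')
label = posted.replace('Posted ', '')                                     # -> '30+ days ago'
missing = sel.xpath('//h3[@id="no-such-id"]/text()').extract_first()      # -> None
empty = sel.xpath('//h3[@id="no-such-id"]/text()').extract()              # -> []
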
pydataSpiderDetails.py (project: pydata_webscraping, author: jmortega)
def parse(self, response):
        hxs = scrapy.Selector(response)
        slots_tutorials = hxs.xpath('//td[@class="slot slot-tutorial"]')
        for slot in slots_tutorials:
            # NOTE: these XPaths are absolute ('//'), so they search the whole
            # document rather than the current slot; after the loop the three
            # lists hold every speaker/url/talk on the page
            speakers_tutorials = slot.xpath('//span[@class="speaker"]/text()').extract()
            urls_tutorials = slot.xpath('//span[@class="title"]//@href').extract()
            talks_tutorials = slot.xpath('//span[@class="title"]//a/text()').extract()

        indexSpeaker = 0
        for speaker in speakers_tutorials:
            yield Request(url=''.join(('http://www.pydata.org', urls_tutorials[indexSpeaker])),
                          callback=self.parse_details,
                          meta={'speaker': speaker.strip(), 'url': urls_tutorials[indexSpeaker],
                                'talk': talks_tutorials[indexSpeaker]})
            indexSpeaker = indexSpeaker + 1
ip181.py (project: rental, author: meihuanyu)
def parse_page(self, response):
        self.write(response.body)

        sel = Selector(response)
        infos = sel.xpath('//tbody/tr').extract()
        for i, info in enumerate(infos):
            if i == 0:
                continue

            val = Selector(text = info)
            ip = val.xpath('//td[1]/text()').extract_first()
            port = val.xpath('//td[2]/text()').extract_first()
            country = val.xpath('//td[6]/text()').extract_first()
            anonymity = val.xpath('//td[3]/text()').extract_first()
            https = val.xpath('//td[4]/text()').extract_first()

            proxy = Proxy()
            proxy.set_value(
                    ip = ip,
                    port = port,
                    country = country,
                    anonymity = anonymity,
                    source = self.name,
            )

            self.add_proxy(proxy = proxy)
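
The loop above serialises every <tr> back to HTML with extract() and wraps each string in a new Selector(text=...). That works, but selector objects can also be iterated directly and queried with relative XPaths ('./td[1]'), which avoids re-parsing each row. A minimal sketch of that variant under the same table layout (parse_rows and the yielded dict are illustrative, not part of the original project):

from scrapy.selector import Selector

def parse_rows(response):
    """Same table extraction as above, without re-serialising each row."""
    sel = Selector(response)
    for i, row in enumerate(sel.xpath('//tbody/tr')):
        if i == 0:                                   # skip the header row, as the original does
            continue
        yield {
            'ip': row.xpath('./td[1]/text()').extract_first(),
            'port': row.xpath('./td[2]/text()').extract_first(),
            'anonymity': row.xpath('./td[3]/text()').extract_first(),
            'country': row.xpath('./td[6]/text()').extract_first(),
        }
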
scrapy_xinhua.py (project: web_crawler, author: NearXdu)
def parse(self, response):
        def getdomain(url):
            proto, rest = urllib.splittype(url)
            host, rest = urllib.splithost(rest)
            return "http://"+host
        sel=scrapy.Selector(response)
        links_in_a_page=sel.xpath('//a[@href]')

        for link_sel in links_in_a_page:
            item=XinhuaItem()
            link=str(link_sel.re('href="(.*?)"')[0])


            if link:
                if not link.startswith('http'):
                    link=response.url+link
                    #link=getdomain(response.url)+link

                yield scrapy.Request(link,callback=self.parse)

                p1=re.compile(r'.*\d{4}-\d{2}/\d{2}.*')
                if re.match(p1,link):
                    print ("Y: "+link)
                    item['link']=link
                    yield item
                else:
                    print ("F: "+link)
scrapy_qq.py (project: web_crawler, author: NearXdu)
def parse(self, response):
        def getdomain(url):
            proto, rest = urllib.splittype(url)
            host, rest = urllib.splithost(rest)
            return "http://"+host

        sel=scrapy.Selector(response)
        links_in_a_page = sel.xpath('//a[@href]')

        for link_sel in links_in_a_page:
            item=QqurlItem()
            link=str(link_sel.re('href="(.*?)"')[0])

            if link:
                if not link.startswith('http'):
                    if link.startswith('javascript'):
                        continue
                    if link.startswith('//support'):
                        continue
                    link=getdomain(response.url)+link


                if  re.match('.*comment.*',link):
                    continue


                yield scrapy.Request(link,callback=self.parse)
                if not re.match('.*comment.*',link):
                    if re.match('^http.*qq.com.*\.s?html?$',link):
                        item['link']=link
                        yield item
scrapy_sohu.py (project: web_crawler, author: NearXdu)
def parse(self, response):
        def getdomain(url):
            #proto,rest=urllib.splittype(url)
            #host,rest=urllib.splithost(rest)
            return "http:"

        sel =  scrapy.Selector(response)
        links_in_a_page=sel.xpath('//a[@href]')

        for link_sel in links_in_a_page:
            item=SohuItem()
            link=str(link_sel.re('href="(.*?)"')[0])

            if link:
                if not link.startswith('http'):
                    link=getdomain(response.url)+link

                yield scrapy.Request(link,callback=self.parse)

                p1=re.compile(r'.*/a/.*')
                p2=re.compile(r'.*#comment_area$')
                p3=re.compile(r'.*news.sohu.com.*s?html?$')

                if (re.match(p3,link) or re.match(p1,link)) and (not re.match(p2,link)):
                    #print ('T: '+link)
                    item['link']=link
                    yield item
                else:
                    pass
                    #print ('F: '+link)
spider_7_quotes_js2xml.py (project: scrapy-training, author: scrapinghub)
def alternative_parse_method(self, response):
        # An alternative would be to build a Scrapy selector from the JS string
        # and extract the data using CSS selectors
        script = response.xpath('//script[contains(., "var data =")]/text()').extract_first()
        sel = scrapy.Selector(root=js2xml.parse(script))
        for quote in sel.css('var[name="data"] > array > object'):
            yield {
                'text': quote.css('property[name="text"] > string::text').extract_first(),
                'author': quote.css('property[name="author"] property[name="name"] > string::text').extract_first(),
                'tags': quote.css('property[name="tags"] string::text').extract(),
            }

        link_next = response.css('li.next a::attr("href")').extract_first()
        if link_next:
            yield scrapy.Request(response.urljoin(link_next))
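
Here js2xml.parse() converts the JavaScript source into an lxml tree, so the var data = [...] literal becomes XML nodes (var, array, object, property, string) that the CSS selectors above reach via Selector(root=...). A common non-selector approach to the same problem is to cut the literal out with a regular expression and decode it with json.loads; a minimal sketch, assuming the script really contains a JSON-compatible var data = [...]; assignment:

import json
import re

def extract_data_literal(script):
    """Pull the var data = [...] literal out of a <script> body and decode it."""
    match = re.search(r'var\s+data\s*=\s*(\[.*?\])\s*;', script, re.DOTALL)
    if match is None:
        return []
    return json.loads(match.group(1))   # only works while the literal stays JSON-compatible
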
spider_3_quotes_selenium.py (project: scrapy-training, author: scrapinghub)
def parse(self, response):
        self.driver.get(response.url)
        sel = scrapy.Selector(text=self.driver.page_source)
        for quote in sel.css('div.quote'):
            yield {
                'text': quote.css('span.text::text').extract_first(),
                'author': quote.css('span small::text').extract_first(),
                'tags': quote.css('div.tags a.tag::text').extract(),
            }
        next_page = sel.css('li.next > a::attr(href)').extract_first()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page))
Meizitu.py (project: Scrapy_CrawlMeiziTu, author: williamzxl)
def parse(self, response):
        selector = scrapy.Selector(response)
        #item = CrawlmeizituItemPage()

        next_pages = selector.xpath('//*[@id="wp_page_numbers"]/ul/li/a/@href').extract()
        next_pages_text = selector.xpath('//*[@id="wp_page_numbers"]/ul/li/a/text()').extract()
        all_urls = []
        if '下一页' in next_pages_text:  # '下一页' = "next page"
            next_url = "http://www.meizitu.com/a/{}".format(next_pages[-2])
            with open('..//url.txt', 'a+') as fp:
                fp.write('\n')
                fp.write(next_url)
                fp.write("\n")
            request = scrapy.http.Request(next_url, callback=self.parse)
            time.sleep(2)
            yield request

        all_info = selector.xpath('//h3[@class="tit"]/a')
        # collect the detail-page links from the h3.tit headings; note that the
        # absolute XPath ('//') inside the loop matches every heading on the
        # page, so `links` ends up holding all of them
        for info in all_info:
            links = info.xpath('//h3[@class="tit"]/a/@href').extract()
        for link in links:
            request = scrapy.http.Request(link, callback=self.parse_item)
            time.sleep(1)
            yield request

        # next_link = selector.xpath('//*[@id="wp_page_numbers"]/ul/li/a/@href').extract()
        # next_link_text = selector.xpath('//*[@id="wp_page_numbers"]/ul/li/a/text()').extract()
        # if '下一页' in next_link_text:
        #     nextPage = "http://www.meizitu.com/a/{}".format(next_link[-2])
        #     item['page_url'] = nextPage
        #     yield item
crawldetails.py (project: crawllagou, author: ScarecrowFu)
def parse_detail(self,response):
        item = CrawldetailsItem()
        sel = Selector(response)

        try:
            item["kd"] = response.meta['kd']
            item["title"] = self.get_text(sel,'//*[@id="job_detail"]/dt/h1/@title')
            item["company"] = sel.xpath('//*[@id="container"]/div[2]/dl/dt/a/div/h2/text()').extract()[0].strip()
            item["city"] = sel.xpath('//*[@id="job_detail"]/dd[1]/p[1]/span[2]/text()').extract()[0]
            item["address"] = sel.xpath('//*[@id="container"]/div[2]/dl/dd/div[1]/text()').extract()[0]
            industry = sel.xpath('//*[@id="container"]/div[2]/dl/dd/ul[1]/li[1]').extract()[0]
            item["industry"] = BeautifulSoup(industry).get_text().encode("utf-8").split(' ')[1].strip()
            scale = sel.xpath('//*[@id="container"]/div[2]/dl/dd/ul[1]/li[2]').extract()[0]
            item["scale"] = BeautifulSoup(scale).get_text().encode("utf-8").split(' ')[1].strip()
            phase = sel.xpath('//*[@id="container"]/div[2]/dl/dd/ul[2]/li').extract()[0]
            item["phase"] = BeautifulSoup(phase).get_text().encode("utf-8").split(' ')[1].strip()
            item["salary"] = sel.xpath('//*[@id="job_detail"]/dd[1]/p[1]/span[1]/text()').extract()[0]
            item["experience"] = sel.xpath('//*[@id="job_detail"]/dd[1]/p[1]/span[3]/text()').extract()[0]
            item["education"] = sel.xpath('//*[@id="job_detail"]/dd[1]/p[1]/span[4]/text()').extract()[0]
            item["description"] = self.get_text(sel,'//*[@id="job_detail"]/dd[2]')
            item["url"] = response.url
            item["published"] = sel.xpath('//*[@id="job_detail"]/dd[1]/p[3]/text()').extract()[0][:-8]
            item["tag"] = self.get_text(sel, '//*[@id="job_detail"]/dd[1]/p[2]/text()')


        except Exception, e:
            print e
        yield item
fas_ui_profile.py (project: directory-tests, author: uktrade)
def get_case_studies_details(response: Response):
    content = response.content.decode("utf-8")
    article_selector = "#company-projects > article"
    articles = Selector(text=content).css(article_selector).extract()
    result = []
    for article in articles:
        title = Selector(text=article).css("h3::text").extract()[0]
        summary = Selector(text=article).css("p::text").extract()[0]
        href = Selector(text=article).css("a::attr(href)").extract()[0]
        slug = href.split("/")[-2]
        assert slug, "Could not extract case study slug from {}".format(article)
        logging.debug("Got case study slug: %s", slug)
        result.append((title, summary, href, slug))
    assert result, "No Case Study details extracted from {}".format(articles)
    return result
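
The helper above turns each <article> into an HTML string and wraps it in a fresh Selector, then indexes extract()[0], which raises IndexError as soon as a field is missing. The loop can also stay on selector objects and use extract_first(), so missing fields come back as None; a sketch under the same "#company-projects > article" markup assumption (not the project's actual helper):

from scrapy.selector import Selector

def get_case_studies_details_sketch(content):
    result = []
    for article in Selector(text=content).css("#company-projects > article"):
        title = article.css("h3::text").extract_first()
        summary = article.css("p::text").extract_first()
        href = article.css("a::attr(href)").extract_first()
        slug = href.split("/")[-2] if href else None   # None when the article has no link
        result.append((title, summary, href, slug))
    return result
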
fab_when_impl.py (project: directory-tests, author: uktrade)
def fas_get_company_profile_url(response: Response, name: str) -> str:
    content = response.content.decode("utf-8")
    links_to_profiles_selector = "#ed-search-list-container a"
    href_selector = "a::attr(href)"
    links_to_profiles = Selector(text=content).css(
        links_to_profiles_selector).extract()
    profile_url = None
    for link in links_to_profiles:
        if escape_html(name).lower() in escape_html(link).lower():
            profile_url = Selector(text=link).css(href_selector).extract()[0]
    with assertion_msg(
            "Couldn't find link to '%s' company profile page in the response",
            name):
        assert profile_url
    return profile_url
fab_when_impl.py (project: directory-tests, author: uktrade)
def fas_follow_case_study_links_to_related_sectors(context, actor_alias):
    actor = context.get_actor(actor_alias)
    session = actor.session
    content = context.response.content.decode("utf-8")
    links_css_selector = "#company-showcase .case-study-info a"
    links_to_sectors = Selector(text=content).css(links_css_selector).extract()
    with assertion_msg("Expected to find at least 1 link to Industry sector "
                       "associated with Company Showcase Case Study"):
        assert links_to_sectors
    results = {}
    fas_url = get_absolute_url("ui-supplier:landing")
    for link in links_to_sectors:
        industry = Selector(text=link).css("a::text").extract()[0]
        href = Selector(text=link).css("a::attr(href)").extract()[0]
        url = urljoin(fas_url, href)
        sectors = [value for _, value in parse_qsl(urlsplit(href).query)]
        logging.debug(
            "%s will look for Suppliers in '%s' Industry sectors '%s'",
            actor_alias, industry, ", ".join(sectors)
        )
        response = make_request(Method.GET, url=url, session=session)
        results[industry] = {
            "url": url,
            "sectors": sectors,
            "response": response
        }
    context.results = results
fab_then_impl.py (project: directory-tests, author: uktrade)
def fas_should_see_unfiltered_search_results(context, actor_alias):
    response = context.response
    content = response.content.decode("utf-8")
    sector_filters_selector = "#id_sectors input"
    filters = Selector(text=content).css(sector_filters_selector).extract()
    for fil in filters:
        sector = Selector(text=fil).css("input::attr(value)").extract()[0]
        selector = "input::attr(checked)"
        checked = True if Selector(text=fil).css(selector).extract() else False
        with assertion_msg(
                "Expected search results to be unfiltered but this "
                "filter was checked: '%s'", sector):
            assert not checked
    logging.debug("%s was shown with unfiltered search results", actor_alias)
dzdp_spider.py (project: web-crawler-spider-, author: Hardysong)
def parse_location(self,response):

        loc_hxs = scrapy.Selector(response)
        loc_xs = loc_hxs.xpath('//div[@id="aside"]/script[1]').extract()[0]
        coord_text = re.findall(r'lng:\w+.\w+,lat:\w+.\w+',loc_xs)[0]

        item = response.meta['item']
        item['location'] = coord_text.encode('gbk')
        return item
        #print  coord_text
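
The regular expression above captures a substring such as 'lng:121.480539,lat:31.235929' from the inline script; turning it into numbers takes one more split. A minimal sketch with an invented script string:

import re

script_text = 'var shop = {lng:121.480539,lat:31.235929};'                 # invented sample
coord_text = re.findall(r'lng:\w+.\w+,lat:\w+.\w+', script_text)[0]        # same pattern as above
lng, lat = (float(part.split(':')[1]) for part in coord_text.split(','))   # -> 121.480539, 31.235929
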
lianjia_spider.py (project: web-crawler-spider-, author: Hardysong)
def parse(self,response):
        reload(sys)
        sys.setdefaultencoding('utf8')

        print '__________'
        if response.status == 403:
            print 'met 403, sleeping 1200 seconds'
            import time
            time.sleep(1200)
            yield Request(response.url,callback=self.parse)
        # 404: page not found, nothing more to do
        elif response.status == 404:
            print 'met 404, returning'
        else:

            hxs = scrapy.Selector(response)

            for i in range(1,31):
                item = SoufangItem()


                name_ = hxs.xpath('/html/body/div[4]/div[1]/ul/li['+str(i)+']/div[1]/div[1]/a/text()').extract()
                name = ''.join(name_)

                http = hxs.xpath('/html/body/div[4]/div[1]/ul/li['+str(i)+']/div[1]/div[1]/a/@href').extract()
                href = ''.join(http)
                #href = href + 'xiangqing/'

                item['name'] = name.encode('gbk')

                item['link'] = href.encode('gbk')

                yield Request(href,callback=self.parse_detail,meta={'item':item})

                print name, href
            print '__________'
lianjia_spider.py (project: web-crawler-spider-, author: Hardysong)
def parse_detail(self,response):
        #print 'in'

        loc_hxs = scrapy.Selector(response)
        loudongzongshu = loc_hxs.xpath('/html/body/div[5]/div[2]/div[2]/div[5]/span[2]/text()').extract()
        loudongzongshu = ''.join(loudongzongshu)

        fangwuzongshu = loc_hxs.xpath('/html/body/div[5]/div[2]/div[2]/div[6]/span[2]/text()').extract()
        fangwuzongshu = ''.join(fangwuzongshu)

        item = response.meta['item']
        item['address'] = loudongzongshu.encode('gbk')
        item['zonghushu'] = fangwuzongshu.encode('gbk')

        return item
soufang_spider.py (project: web-crawler-spider-, author: Hardysong)
def parse_detail(self,response):

        loc_hxs = scrapy.Selector(response)
        build_num_ = loc_hxs.xpath('/html/body/div[3]/div[4]/div[1]/div[2]/div[2]/ul/li[2]/text()').extract()
        build_num = ''.join(build_num_)

        total_households_ = loc_hxs.xpath('/html/body/div[3]/div[4]/div[1]/div[2]/div[2]/ul/li[4]/text()').extract()
        total_households = ''.join(total_households_)

        plot_ratio_ = loc_hxs.xpath('/html/body/div[3]/div[4]/div[1]/div[2]/div[2]/ul/li[6]/text()').extract()
        plot_ratio = ''.join(plot_ratio_)

        green_ratio_ = loc_hxs.xpath('/html/body/div[3]/div[4]/div[1]/div[2]/div[2]/ul/li[8]/text()').extract()
        green_ratio = ''.join(green_ratio_)

        property_fee_ = loc_hxs.xpath('/html/body/div[3]/div[4]/div[1]/div[2]/div[2]/ul/li[10]/text()').extract()
        property_fee = ''.join(property_fee_)

        item = response.meta['item']
        item['build_num'] = build_num.encode('gbk')
        item['total_households'] = total_households.encode('gbk')
        item['plot_ratio'] = plot_ratio.encode('gbk')
        item['greening_ratio'] = green_ratio.encode('gbk')
        item['properity_fee'] = property_fee.encode('gbk')

        return item
test_design_topic_spider.py (project: decoration-design-crawler, author: imflyn)
def test_parse_content(self):
        content = requests.get('http://xiaoguotu.to8to.com/topic/11.html')
        response = Response('http://xiaoguotu.to8to.com/topic/11.html')
        response.text = content.content.decode("utf-8")
        selector = Selector(response)
        title = selector.xpath('//div[@class="xdb_title"]/h1/text()').extract()[0]
        description = selector.xpath('//div[@class="xdbc_description"]//div//p/text()').extract()[0]
        items_selector = selector.xpath('//div[@class="xdbc_main_content"]//p')
        article = []
        text = ''
        for index, item_selector in enumerate(items_selector):
            try:
                text = item_selector.xpath('span/text()').extract()[0]
            except IndexError:
                try:
                    img_url = item_selector.xpath('img/@src').extract()[0]
                    img_width = 0
                    try:
                        img_width = item_selector.xpath('img/@width').extract()[0]
                    except IndexError:
                        pass
                    img_height = 0
                    try:
                        img_height = item_selector.xpath('img/@height').extract()[0]
                    except IndexError:
                        pass
                    article.append({'content': text, 'img_url': img_url, 'img_width': img_width, 'img_height': img_height})
                except IndexError:
                    continue
        design_topic_item = DesignTopicItem()
        design_topic_item['title'] = title
        design_topic_item['description'] = description
        design_topic_item['article'] = article
        design_topic_item['html_url'] = response.url
        return design_topic_item
xicispiders.py (project: scrapyweixi, author: Felix-P-Code)
def parse(self, response):
        sel = scrapy.Selector(response)
        #print(sel.xpath('//title').extract())
        fligint_div = "//ul[@class='news-list2']/li[1]/div[@class='gzh-box2']/div[@class='img-box']/a[1]/@href"
        first_url_list = sel.xpath(fligint_div).extract()
        self.first_url = first_url_list[0]
        print(self.first_url)
        yield  scrapy.Request(self.first_url,meta=self.meta, callback=self.parse_url_list)

