Example source code for the Python Selector() class
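Before the individual snippets, here is a minimal sketch of the scrapy.Selector calls they all share; the HTML string is an invented stand-in for a crawled response.

from scrapy.selector import Selector

html = '<div id="wrapper"><h1><span>A Book</span></h1></div>'
sel = Selector(text=html)

# extract() returns a list with every match
names = sel.xpath("//div[@id='wrapper']//h1//span/text()").extract()   # [u'A Book']
# extract_first() returns the first match, or the given default
first = sel.xpath("//h1//span/text()").extract_first(default='n/a')    # u'A Book'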

Source file: BookSpider.py (project: crawl_web, author: hanxlinsist)
def parse_book(self, response):
        item = BookItem()
        sel = Selector(response)
        e = sel.xpath("//div[@id='wrapper']")
        item['name'] = e.xpath("./descendant::h1/descendant::span/text()").extract()
        item['author'] = e.xpath("//*[@id='info']/span[1]/a/text()").extract()
        item['bookinfo'] = e.xpath("//*[@id='info']/text()").extract()
        item['score'] = e.xpath('//*[@id="interest_sectl"]/div/div[2]/strong/text()').extract()
        item['commentNum'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@property = "v:votes"]/text()').extract()

        item['fivestar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][1]/text()').extract()
        item['fourstar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][2]/text()').extract()
        item['threestar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][3]/text()').extract()
        item['twostar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][4]/text()').extract()
        item['onestar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][5]/text()').extract()

        item['tag'] = response.xpath("//*[@id = 'db-tags-section']/descendant::a/text()").extract()

        request = scrapy.Request(response.url + "/comments/hot", callback=self.parse_review)  # follow the hot-comments page; parse_review picks the item up via meta
        request.meta['item'] = item

        return request
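
parse_review itself is not part of this snippet; the following is a minimal sketch of what such a callback might look like, where the 'comments' field and the XPath are illustrative assumptions rather than code from the project.

def parse_review(self, response):
        # the item built in parse_book arrives through the request meta
        item = response.meta['item']
        # 'comments' and this XPath are assumptions, not from the original source
        item['comments'] = response.xpath('//div[@class="comment"]/p/text()').extract()
        return item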


Source file: MovieSpider.py (project: crawl_web, author: hanxlinsist)
def parse_item(self, response):
        item = BookItem()
        sel = Selector(response)
        e = sel.xpath("//div[@id='wrapper']")
        item['name'] = e.xpath("./descendant::h1/descendant::span/text()").extract()
        item['author'] = e.xpath("//*[@id='info']/span[1]/a/text()").extract()
        item['bookinfo'] = e.xpath("//*[@id='info']/text()").extract()


        item['score'] = e.xpath('//*[@id="interest_sectl"]/div/div[2]/strong/text()').extract()
        item['commentNum'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@property = "v:votes"]/text()').extract()
        item['fivestar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][1]/text()').extract()
        item['fourstar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][2]/text()').extract()
        item['threestar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][3]/text()').extract()
        item['twostar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][4]/text()').extract()
        item['onestar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][5]/text()').extract()

        return item
Source file: TestSpider.py (project: crawl_web, author: hanxlinsist)
def parse(self, response):
        item = BookItem()
        sel = Selector(response)
        e = sel.xpath("//div[@id='wrapper']")
        item['name'] = e.xpath("./descendant::h1/descendant::span/text()").extract()
        item['author'] = e.xpath("//*[@id='info']/span[1]/a/text()").extract()
        item['bookinfo'] = e.xpath("//*[@id='info']/text()").extract()
        item['score'] = e.xpath('//*[@id="interest_sectl"]/div/div[2]/strong/text()').extract()
        item['commentNum'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@property = "v:votes"]/text()').extract()

        item['fivestar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][1]/text()').extract()
        item['fourstar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][2]/text()').extract()
        item['threestar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][3]/text()').extract()
        item['twostar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][4]/text()').extract()
        item['onestar'] = e.xpath('//*[@id="interest_sectl"]/descendant::span[@class = "rating_per"][5]/text()').extract()

        item['tag'] = response.xpath("//*[@id = 'db-tags-section']/descendant::a/text()").extract()

        request = scrapy.Request(response.url + "/comments/hot", callback=self.parse_review)  # follow the hot-comments page; parse_review picks the item up via meta
        request.meta['item'] = item

        return request


Source file: initiatives.py (project: tipi-engine, author: CIECODE-Madrid)
def parse(self, response):

        list_types = Selector(response).xpath('//div[@class="listado_1"]//ul/li/a')
        for types in list_types:
            href = types.xpath("./@href").extract()
            text = types.xpath("./text()").extract()
            if Terms.filterBytype(text[0]):
                type = Terms.getType(text[0])
                initiative_url = Utils.createUrl(response.url,href[0])
                yield scrapy.Request(initiative_url,errback=self.errback_httpbin,callback=self.initiatives, meta={'type': type})
        """
        urlsa = ""
        urlsa = "http://www.congreso.es/portal/page/portal/Congreso/Congreso/Iniciativas/Indice%20de%20Iniciativas?_piref73_1335503_73_1335500_1335500.next_page=/wc/servidorCGI&CMD=VERLST&BASE=IW12&PIECE=IWC2&FMT=INITXD1S.fmt&FORM1=INITXLUS.fmt&DOCS=100-100&QUERY=%28I%29.ACIN1.+%26+%28161%29.SINI."


        yield scrapy.Request(urlsa, errback=self.errback_httpbin, callback=self.oneinitiative,
                             meta={'type': u"Proposición no de Ley en Comisión"})
        """
Source file: initiatives.py (project: tipi-engine, author: CIECODE-Madrid)
def initiatives(self, response):
        type = response.meta['type']
        first_url = Selector(response).xpath('//div[@class="resultados_encontrados"]/p/a/@href').extract()[0]
        num_inis = Selector(response).xpath('//div[@class="SUBTITULO_CONTENIDO"]/span/text()').extract()
        split = first_url.partition("&DOCS=1-1")
        for i in range(1,int(num_inis[0])+1):
            new_url = split[0]+"&DOCS="+str(i)+"-"+str(i)+split[2]
            initiative_url = Utils.createUrl(response.url,new_url)
            CheckItems.addElement(initiative_url)

            # request only initiatives that are not already blacklisted
            if not Blacklist.getElement(initiative_url):
                yield scrapy.Request(initiative_url, errback=self.errback_httpbin,
                                     callback=self.oneinitiative, meta={'type': type})
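
errback_httpbin is not included in the snippet; a minimal sketch of a logging errback, in the spirit of the Scrapy documentation, could be:

def errback_httpbin(self, failure):
        # log the failure so one bad URL does not abort the whole crawl
        self.logger.error(repr(failure))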
Source file: dice.py (project: job_scraper, author: wlabatey)
def parse_items(self, response):
        hxs = Selector(response)
        jobs = hxs.xpath('//div[contains(@class, "searchResultTitle")]')
        items = []
        for job in jobs:
            item = Job()
            item["title"] = job.xpath('.//h2/a[contains(@id, "TITLE")]/text()').extract()[0].strip()
            company = job.xpath('.//p/span[contains(@id, "CONTACT_OFFICE")]/text()').extract()
            item["company"] = company[0].strip() if company else "n/a"
            item["location"] = job.xpath('.//p/span[contains(@id, "FREE_LOCATION")]/text()').extract()[0].strip()
            item["url"] = job.xpath('.//h2/a[contains(@id, "TITLE")]/@href').extract()[0]
            item["date_posted"] = job.xpath('.//p/span[contains(@id, "POSTED_DATE")]/text()').extract()[0].strip()
            salary = job.xpath('.//p/span[contains(@id, "SALARY")]/text()').extract()
            item["salary"] = salary[0].strip() if salary else "n/a"
            item["crawl_timestamp"] = datetime.now().strftime("%H:%M:%S %Y-%m-%d") 
            item["job_board"] = "dice"
            items.append(item)
        return items
Source file: stackoverflow.py (project: job_scraper, author: wlabatey)
def parse(self, response):
        hxs = Selector(response)
        jobs = hxs.xpath('//div[contains(@class, "-job-item")]')
        items = []
        for job in jobs:
            item = Job()
            item["title"] = job.xpath('.//a[@class="job-link"]/text()').extract()[0]
            item["company"] = job.xpath('.//div[@class="-name"]/text()').extract()[0].strip()
            item["location"] = re.sub(r'\W+', '', job.xpath('.//div[@class="-location"]/text()').extract()[0].strip())
            item["url"] = job.xpath('.//a[@class="job-link"]/@href').extract()[0]
            item["date_posted"] = job.xpath('.//p[contains(@class, "-posted-date")]/text()').extract()[0].strip()
            item["salary"] = job.xpath('.//span[@class="-salary"]/text()').extract_first(default='n/a').strip()
            item["tags"] = job.css('.-tags p a.post-tag::text').extract()
            item["crawl_timestamp"] = datetime.now().strftime("%H:%M:%S %Y-%m-%d") 
            item["job_board"] = "stackOverflow"
            items.append(item)
        return items
Source file: xicidaili.py (project: rental, author: meihuanyu)
def parse_page(self, response):
        sel = Selector(text = response.body)
        infos = sel.xpath('//tr[@class="odd"]').extract()
        for info in infos:
            val = Selector(text = info)
            ip = val.xpath('//td[2]/text()').extract_first()
            port = val.xpath('//td[3]/text()').extract_first()
            country = val.xpath('//td[4]/a/text()').extract_first()
            anonymity = val.xpath('//td[5]/text()').extract_first()

            proxy = Proxy()
            proxy.set_value(
                    ip = ip,
                    port = port,
                    country = country,
                    anonymity = anonymity,
                    source = self.name,
            )

            self.add_proxy(proxy = proxy)
Source file: proxyrox.py (project: rental, author: meihuanyu)
def parse_page(self, response):
        super(ProxyRoxSpider, self).parse_page(response)

        data = response.xpath('//tr[@class="fat"]').extract()
        for i, d in enumerate(data):
            sel = Selector(text = d)

            ip_port = sel.xpath('//td/a/text()').extract_first()
            ip = ip_port.split(':')[0]
            port = ip_port.split(':')[1]
            country = sel.xpath('//td/span[@class="region"]/text()').extract_first()
            anonymity = sel.xpath('//td/span/text()').extract_first()

            proxy = Proxy()
            proxy.set_value(
                    ip = ip,
                    port = port,
                    country = country,
                    anonymity = anonymity,
                    source = self.name
            )

            self.add_proxy(proxy = proxy)
Source file: proxydb.py (project: rental, author: meihuanyu)
def parse_page(self, response):
        super(ProxyDBSpider, self).parse_page(response)

        data = response.xpath('//tbody/tr').extract()
        for i, d in enumerate(data):
            sel = Selector(text = d)

            ip_port = sel.xpath('//td/a/text()').extract_first()
            ip = ip_port.split(':')[0]
            port = ip_port.split(':')[1]
            country = sel.xpath('//td/img/@title').extract_first()
            anonymity = sel.xpath('//td/span[@class="text-success"]/text()').extract_first()

            proxy = Proxy()
            proxy.set_value(
                    ip = ip,
                    port = port,
                    country = country,
                    anonymity = anonymity,
                    source = self.name
            )

            self.add_proxy(proxy = proxy)
Source file: holerite.py (project: cmc-transparencia-spider, author: CodeForCuritiba)
def parse_salaries(self, response):
        """
        The values about person salary is in another table
        in another page, that function grab all the table headers
        and values and assign to the entity[entity_id]
        The id was passed in the response.meta
        """

        item = VereadorItem()
        item['name'] = response.meta['name']
        item['id'] = response.meta['entity_id']
        item['mesano'] = response.meta['mesano']

        for salary in response.xpath('//*[@id="holerite"]').extract():
            selector = Selector(text=salary)
            table = selector.xpath('//tr[@class="holerite_valor"]/td/text()').extract()
            item["salary_gross"] = table[0]
            item["salary_liquid"] = selector.xpath('//tr[@class="holerite_valor"]/td/strong/text()').extract_first()
            return item
Source file: poj.py (project: makinami, author: Coderhypo)
def parse(self, response):
        sel = Selector(response)

        self.item = AccountItem()
        self.item['oj'] = 'poj'
        self.item['username'] = self.username
        if self.is_login:
            try:
                self.item['rank'] = sel.xpath('//center/table/tr')[1].\
                        xpath('.//td/font/text()').extract()[0]
                self.item['accept'] = sel.xpath('//center/table/tr')[2].\
                        xpath('.//td/a/text()').extract()[0]
                self.item['submit'] = sel.xpath('//center/table/tr')[3].\
                        xpath('.//td/a/text()').extract()[0]
                yield Request(self.accepted_url % self.username,
                              callback = self.accepted
                             )
                self.item['status'] = 'Authentication Success'
            except:
                self.item['status'] = 'Unknown Error'
        else:
            self.item['status'] = 'Authentication Failed'

        yield self.item
Source file: movie_spider.py (project: Spider_Hub, author: WiseDoge)
def parse_item(self, response):
        item = DoubanmovieItem()
        sel = Selector(response)

        title = sel.xpath('//*[@id="content"]/h1/span[1]/text()').extract()[0]
        year = sel.xpath('//*[@id="content"]/h1/span[2]/text()').extract()[0]
        commit_num = sel.xpath(
            '//*[@id="interest_sectl"]/div[1]/div[2]/div/div[2]/a/span/text()').extract()[0]
        star = sel.xpath(
            '//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()').extract()[0]
        director = sel.xpath(
            '//*[@id="info"]/span[1]/span[2]/a/text()').extract()[0]
        screenwriter = sel.xpath(
            '//*[@id="info"]/span[2]/span[2]/a/text()').extract()[0]

        item['title'] = title
        item['date'] = year
        item['star'] = star
        item['commit_num'] = commit_num
        item['director'] = director
        item['screenwriter'] = screenwriter

        return item
Source file: postSpider.py (project: JianShu-Donate, author: whatbeg)
def parse(self, response):
        selector = Selector(response)
        articles = selector.xpath('//ul[@class="article-list thumbnails"]/li')

        for article in articles:
            item = Jianshu2Item()
            url = article.xpath('div/h4/a/@href').extract()
            likeNum = article.xpath('div/div/span[2]/text()').extract()
            posturl = 'http://www.jianshu.com'+url[0]

            if len(likeNum) == 0:
                item['likeNum'] = 0
            else:
                item['likeNum'] = int(likeNum[0].split(' ')[-1])

            request = Request(posturl,callback=self.parse_donate)
            request.meta['item'] = item
            yield request

        next_link = selector.xpath('//*[@id="list-container"]/div[@class="load-more"]/button/@data-url').extract()[0]
        if next_link:
            next_link = self.url + str(next_link)
            yield Request(next_link,callback=self.parse)
Source file: spider.py (project: DoubanSpyder, author: muyeby)
def parse_article(self,response):
        hxs = Selector(response)
        keyword = response.meta['keyword']
        movie_name = hxs.xpath('//*[@id="content"]/h1/span[1]/text()').extract()
        movie_roles_paths = hxs.xpath('//*[@id="info"]/span[3]/span[2]')
        movie_roles = []
        for movie_roles_path in movie_roles_paths:
            movie_roles = movie_roles_path.xpath('.//*[@rel="v:starring"]/text()').extract()  # .select() is the deprecated alias of .xpath()
        movie_classification= hxs.xpath('//span[@property="v:genre"]/text()').extract()
        douban_item = DoubanItem()
        douban_item['movie_keyword'] = keyword
        douban_item['movie_name'] = ''.join(movie_name).strip().replace(',',';').replace('\'','\\\'').replace('\"','\\\"').replace(':',';').replace(' ','')
        douban_item['movie_roles'] = ';'.join(movie_roles).strip().replace(',',';').replace('\'','\\\'').replace('\"','\\\"').replace(':',';')
        douban_item['movie_classification'] = ';'.join(movie_classification).strip().replace(',',';').replace('\'','\\\'').replace('\"','\\\"').replace(':',';')
        article_link = hxs.xpath('//*[@id="review_section"]/div/div/div/h3/a/@href').extract()
        tmp = "https://movie.douban.com/review/"
        for item in article_link:
            if tmp in item:
                yield Request(item,meta={'item': douban_item},callback=self.parse_item,cookies=[{'name': 'COOKIE_NAME','value': 'VALUE','domain': '.douban.com','path': '/'},])
Source file: pictureSpider_demo.py (project: PythonCrawler-Scrapy-Mysql-File-Template, author: lawlite19)
def parse(self, response):
        se = Selector(response)  # build a Selector (successor of the old HtmlXPathSelector)
        if re.match(r"http://desk.zol.com.cn/fengjing/\d+x\d+/\d+.html", response.url):  # only handle wallpaper list pages
            src = se.xpath("//ul[@class='pic-list2  clearfix']/li")  # every li under the thumbnail list

            for i in range(1, len(src) + 1):  # XPath positions are 1-based; range(len(src)) would skip the last li
                imgURLs = se.xpath("//ul[@class='pic-list2  clearfix']/li[%d]/a/img/@src" % i).extract()  # thumbnail image url
                titles = se.xpath("//ul[@class='pic-list2  clearfix']/li[%d]/a/img/@title" % i).extract()

                if imgURLs:
                    realUrl = imgURLs[0].replace("t_s208x130c5", "t_s2560x1600c5")  # swap the thumbnail size token for the full-resolution one
                    file_name = u"%s.jpg" % titles[0]  # name the file after the image title

                    path = os.path.join("D:\\pics", file_name)  # save into the local pics directory

                    type = sys.getfilesystemencoding()
                    print file_name.encode(type)

                    item = WebcrawlerScrapyItem()  # fill the item so the pipeline can store it
                    item['name'] = file_name
                    item['url'] = realUrl
                    print item["name"], item["url"]

                    yield item  # hand the item to the pipeline

                    urllib.urlretrieve(realUrl, path)  # download the full-size image to disk

            all_urls = se.xpath("//a/@href").extract()  # collect every link on the page
            for url in all_urls:
                if url.startswith("/fengjing/1920x1080/"):  # follow only the 1920x1080 landscape listing pages
                    yield Request("http://desk.zol.com.cn" + url, callback=self.parse)
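
The snippet above is Python 2 (print statements, urllib.urlretrieve). Under Python 3 the same download call moves to urllib.request; the URL and path below are placeholders.

from urllib.request import urlretrieve

# Python 3 equivalent of the download step above (placeholder arguments)
urlretrieve("http://desk.zol.com.cn/example.jpg", "pics/example.jpg")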
Source file: fifa_spider.py (project: FIFA-Player-Ratings, author: HashirZahir)
def parse(self, response):
        #obtains links from page to page and passes links to parse_playerURL
        sel = Selector(response)    #define selector based on response object (points to urls in start_urls by default) 
        url_list = sel.xpath('//tbody/tr/td[@class="player"]/a/@href')   #obtain a list of href links that contain relative links of players

        for i in url_list:
            relative_url = self.clean_str(i.extract())    #i is a selector and hence need to extract it to obtain unicode object
            print urljoin(response.url, relative_url)   #urljoin is able to merge absolute and relative paths to form 1 coherent link
            req = Request(urljoin(response.url, relative_url),callback=self.parse_playerURL)   #pass on request with new urls to parse_playerURL
            req.headers["User-Agent"] = self.random_ua()    
            yield req

        next_url=sel.xpath('//div[@class="right-nav pull-right"]/a[@rel="next"]/@href').extract_first()  
        if(next_url):                                                                       #checks if next page exists
            clean_next_url = self.clean_str(next_url)
            reqNext = Request(urljoin(response.url, clean_next_url),callback=self.parse)    #calls back this function to repeat process on new list of links
            yield reqNext
Source file: huawei_spider.py (project: MonkeyKing_crawler_recommender, author: BitTigerInst)
def parse(self, response):
    page = Selector(response)

    hrefs = page.xpath('//h4[@class="title"]/a/@href')

    for href in hrefs:
      url = href.extract()
      yield scrapy.Request(url, callback=self.parse_item)

    div = page.xpath('//div[@class="page-ctrl ctrl-app"]')
    hrefs = div.xpath('.//a/@href').extract()

    for href in hrefs:
      url = response.urljoin(href)
      print url
      # yield scrapy.Request(url, self.parse, meta={
      #   'splash': {
      #     'endpoint': 'render.html',
      #     'args': {'wait': 0.5}
      #   }
      # })
Source file: huawei_spider.py (project: MonkeyKing_crawler_recommender, author: BitTigerInst)
def parse_item(self, response):
    page = Selector(response)
    item = AppstoreItem()

    item['title'] = page.xpath('//ul[@class="app-info-ul nofloat"]/li/p/span[@class="title"]/text()').extract_first().encode('utf-8')
    item['url'] = response.url
    item['appid'] = re.match(r'http://.*/(.*)', item['url']).group(1)
    item['intro'] = page.xpath('//meta[@name="description"]/@content').extract_first().encode('utf-8')

    divs = page.xpath('//div[@class="open-info"]')
    recomm = ""
    for div in divs:
      url = div.xpath('./p[@class="name"]/a/@href').extract_first()
      recommended_appid = re.match(r'http://.*/(.*)', url).group(1)
      name = div.xpath('./p[@class="name"]/a/text()').extract_first().encode('utf-8')
      recomm += "{0}:{1},".format(recommended_appid, name)
    item['recommended'] = recomm
    yield item
Source file: xiaomi_spider.py (project: MonkeyKing_crawler_recommender, author: BitTigerInst)
def parse_page(self, response):
        page = Selector(response)
        lis = page.xpath('//ul[@class="applist"]/li')
        if not lis:  # xpath() returns an empty SelectorList when nothing matches, never None
            return

        url_common = 'http://app.mi.com'

        for li in lis:
            item = XiaomiAppstoreCrawlerItem()
            item['title'] = li.xpath('./h5/a/text()').extract_first().encode('utf-8')
            url = li.xpath('./h5/a/@href').extract_first()
            appid = re.match(r'/detail/(.*)', url).group(1)
            item['appid'] = appid
            # import pudb; pu.db
            req = scrapy.Request(url_common + url, callback=self.parse_details)
            req.meta["item"] = item
            yield req
Source file: mogujie_mac.py (project: first-crawler, author: Xinghaoz)
def parse_item(self, response):
        url_trim = response.url.split('?')[0]


        page = Selector(response)
        title = page.xpath('//span[@itemprop="name"]/text()').extract_first()
        images = page.xpath('//img[@id="J_BigImg"]/@src').extract_first()
        availability = page.xpath('//dd[@class="num clearfix"]/div[@class="J_GoodsStock goods-stock fl"]/text()').extract_first()
        status = response.status

        item = FashionItem()
        item['url'] = url_trim
        item['title'] = title.encode('utf-8')
        item['images'] = images
        item['availability'] = availability.encode('utf-8')
        item['status'] = status
        return item
Source file: sex_by_music.py (project: QQMusicSpider, author: FanhuaandLuomu)
def getMusListToFile(qqid, line, browser, filename):
    m_url = 'http://g.gogoqq.com/music.htm?uin=%s' % qqid
    browser.get(m_url)
    #time.sleep(2)
    WebDriverWait(browser, 2, 0.5).until(lambda item:item.find_element_by_xpath('//*[@id="list"]').is_displayed())
    time.sleep(1)
    liList = Selector(text = browser.page_source).xpath(u'//*[@id="list"]/li/a')
    mList = []
    for m in liList:
        mus = m.xpath('text()')[0].extract()
        print mus  
        mList.append(mus)
    f = open(filename, 'a')
    string = line + '  #music#:' + '##m##'.join(mList)
    f.write(string + '\n')
    f.close()
Source file: douban_new_movie_spider.py (project: TvLive, author: Rano1)
def parse(self, response):
        sel = Selector(response)
        movie_name = sel.xpath("//div[@class='pl2']/a/text()[1]").extract()
        movie_url = sel.xpath("//div[@class='pl2']/a/@href").extract()
        movie_star = sel.xpath("//div[@class='pl2']/div/span[@class='rating_nums']/text()").extract()

        # item = DoubanNewMovieItem()
        item = {}
        # item['movie_name'] = [n.encode('utf-8') for n in movie_name]
        item['movie_name'] = movie_name
        item['movie_star'] = [n for n in movie_star]
        item['movie_url'] = [n for n in movie_url]

        yield item

        print(item['movie_name'], item['movie_star'], item['movie_url'])
Source file: dianpingxmtbabyspider.py (project: spider_scrapy_lianjia, author: stamhe)
def parse_category(self, response):
        self.log("=================================================")
        sel = Selector(response)
        shop_type = response.meta['shop_type']
        city_id = response.meta['city_id']

        cat_url = response.url
        http_status = response.status
        self.log("http_url = %s" % cat_url)
        self.log("http_status = %s proxy = %s" % (http_status, response.meta['proxy']))

        self.log("shop_type = %s" % shop_type)
        items = []
        shop_list = sel.xpath('//li[@class="t-item-box t-district J_li"]/div[@class="t-item"]/div[@class="t-list"]/ul/li')
    self.log("shop_list_len = %d" % len(shop_list))
        for shop in shop_list:
            uri = shop.xpath('a/@href').extract()[0]
            self.log("page_uri = %s" % uri)
            yield scrapy.Request('http://www.dianping.com' + uri, callback=self.parse_list, meta={'shop_type':shop_type, 'cat_url' : cat_url, 'city_id' : city_id})
Source file: dianpingxmtgymspider.py (project: spider_scrapy_lianjia, author: stamhe)
def parse_category(self, response):
        self.log("=================================================")
        sel = Selector(response)
        shop_type = response.meta['shop_type']
        city_id = response.meta['city_id']

        cat_url = response.url
        http_status = response.status
        self.log("http_url = %s" % cat_url)
        self.log("http_status = %s proxy = %s" % (http_status, response.meta['proxy']))

        self.log("shop_type = %s" % shop_type)
        items = []
        #shop_list = sel.xpath('//li[@class="t-item-box t-district J_li"]/div[@class="t-item"]/div[@class="t-list"]/ul/li')
        region_list = sel.xpath('//div[@id="region-nav"]/a')
    self.log("region_list_len = %d" % len(region_list))
        for region in region_list:
            uri = region.xpath('@href').extract()[0]
            self.log("page_uri = %s" % uri)
            #yield scrapy.Request('http://www.dianping.com' + uri, callback=self.parse_list, meta={'shop_type':shop_type, 'cat_url' : cat_url, 'city_id' : city_id})
            yield scrapy.Request(uri, callback=self.parse_list, meta={'shop_type':shop_type, 'cat_url' : cat_url, 'city_id' : city_id})
Source file: baiduspider.py (project: spider_scrapy_lianjia, author: stamhe)
def parse(self, response):
        sel = Selector(response)

        cat_url = response.url
        http_status = response.status
        self.log("http_url = %s" % cat_url)
        self.log("http_status = %s proxy = %s" % (http_status, response.meta['proxy']))

        item = SpiderDianpingXmtItem()
        item['chenshi_name']    = "" 
        item['shop_type']       = 0
        item['shop_url']        = ""
        item['shop_name']       = ""
        item['shop_addr']       = ""
        item['shop_mobile']     = ""
        item['shop_intro']      = ""

        return item
Source file: dianpingxmtspider.py (project: spider_scrapy_lianjia, author: stamhe)
def parse(self, response):
        sel = Selector(response)
        if response.meta.has_key("shop_type"):
            shop_type = response.meta['shop_type']
        else:
            shop_type = self.shop_type_map[response.url]['shop_type']

        if response.meta.has_key("city_id"):
            city_id = response.meta['city_id']
        else:
            city_id = self.shop_type_map[response.url]['city_id']

        cat_url = response.url
        http_status = response.status
        self.log("http_status = %s proxy = %s" % (http_status, response.meta['proxy']))

        self.log("shop_type = %s" % shop_type)
        items = []
        shop_list = sel.xpath('//div[@id="region-nav"]/a')
        for shop in shop_list:
            uri = shop.xpath('@href').extract()[0]
            self.log("page_uri = %s" % uri)
            yield scrapy.Request('http://www.dianping.com' + uri, callback=self.parse_list, meta={'shop_type':shop_type, 'cat_url' : cat_url, 'city_id' : city_id})
Source file: lianjiaspider.py (project: spider_scrapy_lianjia, author: stamhe)
def parse(self, response):
        sel = Selector(response)
        xiaoqu_uri = sel.xpath('//span[@class="title"]/a/@href').extract()[0]
        xiaoqu_list = xiaoqu_uri.split('/')
        xiaoqu_id   = xiaoqu_list[2]
        items = []
        house_lists = sel.xpath('//div[@class="list-wrap"]/ul[@class="house-lst"]/li')
        for house in house_lists:
            item = SpiderScrapyLianjiaItem()
            item['xiaoqu_id']   = xiaoqu_id
            item['house_id']    = house.xpath('@data-id').extract()[0]
            item['title']       = house.xpath('div[@class="info-panel"]/h2/a/text()').extract()[0]
            item['price']       = house.xpath('div[@class="info-panel"]/div[@class="col-3"]/div[@class="price"]/span/text()').extract()[0]
            item['view_count']  = house.xpath('div[@class="info-panel"]/div[@class="col-2"]/div[@class="square"]/div/span/text()').extract()[0]
            #item['size']        = house.xpath('div[@class="info-panel"]/div[@class="col-1"]/div[@class="where"]/span/text()').extract()
            items.append(item)

        return items
Source file: spider.py (project: cl1024, author: wuchujiecode)
def parse_item(self, response):
        item = Cl1024Item()
        item['cl_title'] = response.meta['cl_title']
        item['cl_url'] = response.meta['cl_url']
        item['cl_bankuai'] = response.meta['cl_bankuai']
        item['posted'] = response.meta['posted']
        # redownloaded = re.search('downloaded:(.+?)<BR>', response.body)
        # downloaded = redownloaded[12:-4]
        sel = Selector(response)
        downloaded = sel.xpath('//td/table/tr/td/text()').extract()[2]
        item['torrent_downloaded'] = downloaded[17:]
        item['torrent_url'] = response.url
        ref = sel.xpath('//input[@name="ref"]/@value').extract_first()
        reff = sel.xpath('//input[@name="reff"]/@value').extract_first()

        dl = ('http://www.rmdown.com/download.php?ref=%s&&reff=%s&submit=download' % (ref, reff)).encode('utf-8')
        item['torrent_download_urls'] = dl

        yield item

