def parse_salaries(self, response):
"""
The values about person salary is in another table
in another page, that function grab all the table headers
and values and assign to the entity[entity_id]
The id was passed in the response.meta
"""
item = VereadorItem()
item['name'] = response.meta['name']
item['id'] = response.meta['entity_id']
item['mesano'] = response.meta['mesano']
for salary in response.xpath('//*[@id="holerite"]').extract():
selector = Selector(text=salary)
table = selector.xpath('//tr[@class="holerite_valor"]/td/text()').extract()
item["salary_gross"] = table[0]
item["salary_liquid"] = selector.xpath('//tr[@class="holerite_valor"]/td/strong/text()').extract_first()
return item
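For reference, a minimal sketch of the VereadorItem this callback fills; the field names come from the assignments above, and the project's real items.py may define more:

import scrapy

class VereadorItem(scrapy.Item):
    name = scrapy.Field()
    id = scrapy.Field()
    mesano = scrapy.Field()         # month/year of the payroll query
    salary_gross = scrapy.Field()
    salary_liquid = scrapy.Field()  # net salary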
def parse(self, response):
selector = Selector(response)
articles = selector.xpath('//ul[@class="article-list thumbnails"]/li')
for article in articles:
item = Jianshu2Item()
url = article.xpath('div/h4/a/@href').extract()
likeNum = article.xpath('div/div/span[2]/text()').extract()
posturl = 'http://www.jianshu.com'+url[0]
if len(likeNum) == 0:
item['likeNum'] = 0
else:
item['likeNum'] = int(likeNum[0].split(' ')[-1])
request = Request(posturl,callback=self.parse_donate)
request.meta['item'] = item
yield request
next_link = selector.xpath('//*[@id="list-container"]/div[@class="load-more"]/button/@data-url').extract()[0]
if next_link:
next_link = self.url + str(next_link)
yield Request(next_link,callback=self.parse)
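The parse_donate callback is not shown in this snippet; a minimal sketch of what it needs to do, based on how parse() hands the item over through request.meta (the donateNum field and the XPath here are illustrative assumptions, not the project's actual code):

def parse_donate(self, response):
    # pull the half-filled item back out of the request meta
    item = response.meta['item']
    # hypothetical field and selector; the real Jianshu2Item may differ
    item['donateNum'] = len(response.xpath('//div[@class="support-author"]//li'))
    yield item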
def list_parse(self, response):
selector = Selector(text=response.body)
playlists = selector.xpath("//li//a[@class='msk']/@title")
urls = selector.xpath("//a[@class='zpgi']/@href").extract()
start_url = "http://music.163.com"
for tmp_url in urls:
yield scrapy.Request(url=start_url + tmp_url, method="GET", callback=self.list_parse,
meta={"cat": response.meta['cat']})
for i, tmp in enumerate(playlists, start=1):
list_id = selector.xpath("//li[" + str(i)
+ "]//a[@class='icon-play f-fr']/@data-res-id").extract_first()
# request each playlist's detail page
yield scrapy.Request(url=start_url+"/playlist?id="+list_id, method="GET", callback=self.play_list_parse,
meta={"cat": response.meta['cat'], "id": list_id})
def parse(self, response):
selector = Selector(response)
ID = response.meta["ID"]
text0 = selector.xpath('body/div[@class="u"]/div[@class="tip2"]').extract_first()
info = InfoItem()
if text0:
num_tweets = re.findall(u'\u5fae\u535a\[(\d+)\]', text0) # number of weibo posts
num_follows = re.findall(u'\u5173\u6ce8\[(\d+)\]', text0) # number of follows
num_fans = re.findall(u'\u7c89\u4e1d\[(\d+)\]', text0) # number of fans
if num_tweets:
info["num_tweets"] = int(num_tweets[0])
if num_follows:
info["num_follows"] = int(num_follows[0])
if num_fans:
info["num_fans"] = int(num_fans[0])
url_information1 = "http://weibo.cn/%s/info" % ID
yield Request(url=url_information1, meta={"item":info,"ID":ID}, dont_filter=True, callback=self.parse1)
def parse3_fans(self, response):
""" ????????????ID """
selector = Selector(response)
text2 = selector.xpath('body//table/tr/td/a/@href').extract()
url_main = response.meta["url_main"]
ID_ = response.meta["ID"]
for elem in text2:
elem = re.findall('uid=(\d+)', elem)
if elem:
ID = int(elem[0])
if ID not in self.friends_id: # record each ID only once
self.friends_id.add(ID)
url_next = selector.xpath(
u'body//div[@class="pa" and @id="pagelist"]/form/div/a[text()="\u4e0b\u9875"]/@href').extract()
if url_next:
yield Request(url="http://weibo.cn%s" % url_next[0], meta={"url_main":url_main,"ID":ID_}, callback=self.parse3_fans)
else:
self.fans_finish = True
if self.fans_finish and self.follows_finish:
yield Request(url=url_main, meta={"ID":ID_}, dont_filter=True, callback=self.parse)
def parse3_follows(self, response):
""" ????????????ID """
selector = Selector(response)
text2 = selector.xpath('body//table/tr/td/a/@href').extract()
url_main = response.meta["url_main"]
ID_ = response.meta["ID"]
for elem in text2:
elem = re.findall('uid=(\d+)', elem)
if elem:
ID = int(elem[0])
if ID not in self.friends_id: # record each ID only once
self.friends_id.add(ID)
url_next = selector.xpath(
u'body//div[@class="pa" and @id="pagelist"]/form/div/a[text()="\u4e0b\u9875"]/@href').extract()
if url_next:
yield Request(url="http://weibo.cn%s" % url_next[0], meta={"url_main":url_main,"ID":ID_}, callback=self.parse3_follows)
else:
self.follows_finish = True
if self.fans_finish and self.follows_finish:
yield Request(url=url_main, meta={"ID":ID_}, dont_filter=True, callback=self.parse)
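parse3_fans and parse3_follows above differ only in which flag they set and which callback they re-schedule, so they could share one body. A sketch of that refactor; it assumes the friends_id set and the fans_finish/follows_finish flags used above, and threads the list kind through the pagination callback with functools.partial:

import re
from functools import partial
from scrapy import Request
from scrapy.selector import Selector

def _parse3_list(self, response, kind):
    """Shared body for the fans/follows ID collectors; kind is 'fans' or 'follows'."""
    selector = Selector(response)
    url_main = response.meta["url_main"]
    ID_ = response.meta["ID"]
    for href in selector.xpath('body//table/tr/td/a/@href').extract():
        uid = re.findall(r'uid=(\d+)', href)
        if uid:
            ID = int(uid[0])
            if ID not in self.friends_id:  # record each ID only once
                self.friends_id.add(ID)
    url_next = selector.xpath(
        u'body//div[@class="pa" and @id="pagelist"]/form/div/a[text()="\u4e0b\u9875"]/@href').extract()
    if url_next:  # more pages of this list remain
        yield Request(url="http://weibo.cn%s" % url_next[0],
                      meta={"url_main": url_main, "ID": ID_},
                      callback=partial(self._parse3_list, kind=kind))
    else:  # this list is exhausted; flip its flag
        setattr(self, kind + "_finish", True)
        if self.fans_finish and self.follows_finish:
            yield Request(url=url_main, meta={"ID": ID_}, dont_filter=True, callback=self.parse)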
def parse_single_song(self, response):
loader = response.meta['loader']
selector = Selector(response)
singer = selector.xpath('//title/text()').extract()
loader.add_value('singer', singer)
loader.add_value('_id', response.meta['song_id'])
comment_data, comment_url = api_comment(response.meta['song_id'], 0, 100)
source_data, source_url = api_song_url(response.meta['song_id'])
comment_id = generate_comment_index()['comment_index']
loader.add_value('comment_id', comment_id)
yield scrapy.FormRequest(url=comment_url, method='POST', headers=self.headers,
formdata=comment_data, callback=self.parse_comments,
meta={'comment_id': comment_id})
yield scrapy.FormRequest(url=source_url, method='POST', headers=self.headers,
formdata=source_data, meta={'loader': loader}, callback=self.get_source_url)
def parse_entities(self, response):
"""
A table is displayed with data about each person
who works at the Câmara
"""
mesano = response.meta['mesano']
self.log('Getting mesano: ' + mesano)
# Check if the table is empty
if not response.css('table tr td:nth-child(1)').extract_first():
return self.log('Nenhum dado disponível')
for tr in response.xpath('//table/tr').extract():
selector = Selector(text=tr)
entity_id = re.search("(javascript:pesquisa\()(\d*)(\);)", tr).group(2)
request = scrapy.FormRequest(
url=BASE_URL + 'holerite/consulta_beneficiario.html',
formdata={
'hol_ben_id': entity_id,
'hol_mesano': mesano,
'hol_tipo': '1',
'hol_grupo': GRUPO,
'acao':''
},
callback=self.parse_salaries
)
request.meta['name'] = selector.xpath("//tr/td/text()").extract_first()
request.meta['entity_id'] = entity_id
request.meta['mesano'] = mesano
yield request
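The entity_id comes from the inline javascript handler in each table row; a quick standalone check of that regex (the sample td string is made up):

import re

row = '<td onclick="javascript:pesquisa(4217);">FULANO DE TAL</td>'  # hypothetical row markup
print(re.search(r"(javascript:pesquisa\()(\d*)(\);)", row).group(2))  # -> 4217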
def parse_item(self, response):
selector = Selector(response).xpath('//p[@align="center"]')
for sel in selector:
image_urls = sel.xpath('a/img/@src').extract()
path = []
for img in image_urls:
path.append(urlparse.urlparse(img).path)
item = SisyItem()
item['image_urls'] = image_urls
item['images'] = path
yield item
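The image_urls / images field pair matches what Scrapy's built-in ImagesPipeline expects, so actually downloading the pictures is only a settings change (the store path below is an example). Note the pipeline overwrites images with its own download results, so pre-filling it with URL paths as above becomes redundant once the pipeline is enabled:

# settings.py (Scrapy >= 1.0 import path)
ITEM_PIPELINES = {
    'scrapy.pipelines.images.ImagesPipeline': 1,
}
IMAGES_STORE = '/tmp/sisy_images'  # example path; point this wherever images should land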
def parse(self, response):
selector = Selector(response)
books = selector.xpath('//div[@class="info"]/h2/a/@href').extract()
for book in books:
print book
yield Request(book, callback=self.parse_item)
nextPage = selector.xpath('//span[@class="next"]/a/@href').extract()
if nextPage:
print nextPage[0]
yield Request(self.url+nextPage[0],callback=self.parse)
def parse(self, response):
# print response.body
value = shenZhouCarsItem()
item = fieldsItem()
selector = Selector(response)
cars = selector.xpath('//ul[@class="carInfor-xj clearfix"]')
for index in range(0, len(cars), 2):
basic = cars[index]
specific = cars[index+1]
item['car_brand'] = basic.xpath('li[1]/span[1]/text()').re(r'\s+(.*)')[0]
item['car_series'] = basic.xpath('li[2]/span/text()').re(r'\s+(.*)')[0]
item['car_issue_date'] = basic.xpath('li[3]/span/text()').re(r'\s+(.*)')[0]
item['car_config_model'] = basic.xpath('li[4]/span/text()').re(r'\s+(.*)')[0]
item['car_seats_num'] = specific.xpath('li[1]/span/text()').re(r'\s+(.*)')[0]
item['car_doors'] = specific.xpath('li[2]/span/text()').re(r'\s+(.*)')[0]
item['car_fuel_type'] = specific.xpath('li[3]/span/text()').re(r'\s+(.*)')[0]
item['car_gearbox_type'] = specific.xpath('li[4]/span/text()').re(r'\s+(.*)')[0]
item['car_displacement'] = specific.xpath('li[5]/span/text()').extract()[0]
item['car_fuel_num'] = specific.xpath('li[6]/span/text()').re(r'\s+(.*)')[0]
item['car_drive_way'] = specific.xpath('li[7]/span/text()').re(r'\s+(.*)')[0]
item['car_engine_intake'] = specific.xpath('li[8]/span/text()').re(r'\s+(.*)')[0]
item['car_skylight'] = specific.xpath('li[9]/span/text()').re(r'\s+(.*)')[0]
item['car_tank_capa'] = specific.xpath('li[10]/span/text()').re(r'\s+(.*)')[0]
item['car_voicebox'] = specific.xpath('li[11]/span/text()').re(r'^\s+(\w*)')[0]
item['car_seats_type'] = specific.xpath('li[12]/span/text()').re(r'\s+(.*)')[0]
item['car_reverse_radar'] = specific.xpath('li[13]/span/text()').re(r'\s+(.*)')[0]
item['car_airbag'] = specific.xpath('li[14]/span/text()').re(r'\s+(\w*)')[0]
item['car_dvd'] = specific.xpath('li[15]/span/text()').re(r'\s+(.*)')[0]
item['car_gps'] = specific.xpath('li[16]/span/text()').re(r'\s+(.*)')[0]
if item['car_airbag'] == u'6510':
item['car_airbag'] = "0"
value['model'] = 'RentMe.model_info'
value['pk'] = item['car_brand']+item['car_series']+item['car_issue_date']+item['car_config_model']
value['fields'] = {
'car_brand': item['car_brand'], 'car_series': item['car_series'],
'car_issue_date': item['car_issue_date'], 'car_config_model': item['car_config_model'],
'car_seats_num': item['car_seats_num'], 'car_doors': item['car_doors'],
'car_fuel_type': item['car_fuel_type'], 'car_gearbox_type': item['car_gearbox_type'],
'car_displacement': item['car_displacement'], 'car_fuel_num': item['car_fuel_num'],
'car_drive_way': item['car_drive_way'], 'car_engine_intake': item['car_engine_intake'],
'car_skylight': item['car_skylight'], 'car_tank_capa': item['car_tank_capa'],
'car_voicebox': item['car_voicebox'], 'car_seats_type': item['car_seats_type'],
'car_reverse_radar': item['car_reverse_radar'], 'car_airbag': item['car_airbag'],
'car_dvd': item['car_dvd'], 'car_gps': item['car_gps'],
'car_deposit': 5000, 'car_day_price': 100,
'car_time_out_price': 150, 'car_over_kilo_price': 0.5}
yield value
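The sixteen near-identical lookups against `specific` could also be table-driven. A sketch of a drop-in helper with the same field order and regex (car_displacement, car_voicebox and car_airbag use slightly different extraction in the original, which this glosses over):

def fill_specific_fields(specific, item):
    """Fill the 16 spec fields from the li list inside one carInfor-xj block."""
    fields = ['car_seats_num', 'car_doors', 'car_fuel_type', 'car_gearbox_type',
              'car_displacement', 'car_fuel_num', 'car_drive_way', 'car_engine_intake',
              'car_skylight', 'car_tank_capa', 'car_voicebox', 'car_seats_type',
              'car_reverse_radar', 'car_airbag', 'car_dvd', 'car_gps']
    for pos, field in enumerate(fields, start=1):
        # li[pos] holds the value for this field on the page
        values = specific.xpath('li[%d]/span/text()' % pos).re(r'\s+(.*)')
        if values:
            item[field] = values[0]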
design_picture_spider.py, from project decoration-design-crawler (author: imflyn)
def parse_list(self, response):
selector = Selector(response)
items_selector = selector.xpath('//div[@class="xmp_container"]//div[@class="item"]')
for item_selector in items_selector:
# http://xiaoguotu.to8to.com/c10037052.html
cid = item_selector.xpath('div//a/@href').extract()[0][2:-6]
title = item_selector.xpath('div//a/@title').extract()[0]
# http://xiaoguotu.to8to.com/getxgtjson.php?a2=0&a12=&a11=10037052&a1=0
next_url = (constant.PROTOCOL_HTTP + self.start_url_domain + '/getxgtjson.php?a2=0&a12=&a11={cid}&a1=0').format(cid=cid)
yield scrapy.Request(next_url, self.parse_content, meta={'cid': cid, 'title': title})
def parse_list(self, response):
selector = Selector(response)
items_selector = selector.xpath('//div[@class="xgt_topic"]')
for item_selector in items_selector:
# /topic/7334.html
href = item_selector.xpath('div//a/@href').extract()[0]
href = href.strip()
# http://xiaoguotu.to8to.com/topic/7334.html
next_url = (constant.PROTOCOL_HTTP + self.start_url_domain + href)
if self.design_topic_service.is_duplicate_url(next_url):
continue
yield scrapy.Request(next_url, self.parse_content)
def parse_content(self, response):
selector = Selector(response)
title = selector.xpath('//div[@class="xdb_title"]/h1/text()').extract()[0]
description = selector.xpath('//div[@class="xdbc_description"]//div//p/text()').extract()[0]
items_selector = selector.xpath('//div[@class="xdbc_main_content"]//p')
article = []
text = ''
# text paragraphs update `text`; an image paragraph pairs the image with the most recent text
for index, item_selector in enumerate(items_selector):
try:
text = item_selector.xpath('span/text()').extract()[0]
except IndexError:
try:
img_url = item_selector.xpath('img/@src').extract()[0]
img_width = 0
try:
img_width = item_selector.xpath('img/@width').extract()[0]
except IndexError:
pass
img_height = 0
try:
img_height = item_selector.xpath('img/@height').extract()[0]
except IndexError:
pass
article.append({'content': text, 'img_url': img_url, 'img_width': img_width, 'img_height': img_height})
except IndexError:
continue
design_topic_item = DesignTopicItem()
design_topic_item['title'] = title
design_topic_item['description'] = description
design_topic_item['article'] = article
design_topic_item['html_url'] = response.url
return design_topic_item
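For completeness, a sketch of the DesignTopicItem this callback returns; the field names are taken from the assignments above:

import scrapy

class DesignTopicItem(scrapy.Item):
    title = scrapy.Field()
    description = scrapy.Field()
    article = scrapy.Field()   # list of {'content', 'img_url', 'img_width', 'img_height'} dicts
    html_url = scrapy.Field()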
design_strategy_spider.py, from project decoration-design-crawler (author: imflyn)
def parse_list(self, response):
selector = Selector(response)
items_selector = selector.xpath('//div[@id="listITme"]//div[@class="gl-listItem"]')
for item_selector in items_selector:
id = item_selector.xpath('a/@href').extract()[0].replace('/strategy/', '')
# http://guju.com.cn/strategy/strategy_getStrategyInfo_ajax?strategyModel.id=4498
next_url = (constant.PROTOCOL_HTTP + self.start_url_domain + '/strategy/strategy_getStrategyInfo_ajax?strategyModel.id={id}').format(
id=id)
if self.design_strategy_service.is_duplicate_url(next_url):
log.info("================= duplicate url: " + next_url + " ===========")
continue
yield scrapy.Request(next_url, self.parse_content, meta={'id': id})
def parse(self, response):
# print(response, type(response))
# from scrapy.http.response.html import HtmlResponse
item = TopStockItem()
selector = Selector(response)
stocks = selector.xpath('//td[@class="keyword"]/a[@class="list-title"]')
for index, stock in enumerate(stocks):
item['name'] = stock.xpath('text()').extract()[0]
item['num'] = index + 1
item['source'] = "baidu"
yield item
def parse_detail(self, response):
url = urlparse.urlparse(response.url)
path = url.path.split("/")
item = PostItem()
selector = Selector(response)
item['postId'] = path[2]
item['authorId'] = path[1]
item['postDetail'] = selector.xpath('//div[@class="detail"]').extract()[0]
yield item
def play_list_parse(self, response):
start_url = "http://music.163.com"
item = playListItem()
selector = Selector(text=response.body)
item['list_play'] = int(selector.xpath("//strong[@id='play-count']/text()").extract_first(default='0'))
item['list_collection'] = int(selector.xpath("//a[@class='u-btni u-btni-fav ']/@data-count").extract_first(default='0'))
# item['list_comment'] = int(selector.xpath("//span[@id='cnt_comment_count']/text()").extract_first())
item['list_name'] = selector.xpath("//h2[@class='f-ff2 f-brk']/text()").extract_first()
item['list_id'] = response.meta['id']
item['list_tag'] = selector.xpath("//a[@class='u-tag']/i/text()").extract()
item['list_creator'] = selector.xpath("//span[@class='name']/a/text()").extract_first()
item['list_creator_id'] = selector.xpath("//span[@class='name']/a/@href").extract_first()
item['type'] = response.meta['cat']
# urls = selector.xpath("//ul[@class='f-hide']/li/a/@href").extract()
# for url in urls:
# yield scrapy.Request(url=start_url + url, method="GET", callback=self.detail_parse)
yield item
# def detail_parse(self, response):
# selector = Selector(text=response.body)
# id = selector.xpath("//div[@id='content-operation']/@data-rid").extract_first()
# detail = validate.Validate(str(id))
# info = demjson.decode(detail.get_music_json())
# if info['total'] > 10000:
# item = detailItem()
# item['music_id'] = id
# item['music_name'] = selector.xpath("//em[@class='f-ff2']/text()").extract_first()
# item['music_album'] = selector.xpath("//p[@class='des s-fc4']/a/text()").extract_first()
# item['music_artist'] = selector.xpath("//p[@class='des s-fc4']/span/@title").extract_first()
# item['music_comment_num'] = int(info['total'])
# item['music_comment'] = info['hotComments']
# yield item
def parse(self,response):
item = DoubanmovieItem()
selector = Selector(response)
movies = selector.xpath('//div[@class="info"]')
for eachmovie in movies:
title = eachmovie.xpath('div[@class="hd"]/a/span/text()').extract()
fullTitle = ''
for each in title:
fullTitle += each
movieInfo = eachmovie.xpath('div[@class="bd"]/p/text()').extract()
star = eachmovie.xpath('div[@class="bd"]/div[@class="star"]/span/text()').extract()[0]
quote = eachmovie.xpath('div[@class="bd"]/p[@class="quote"]/span/text()').extract()
if quote:
quote = quote[0]
else:
quote = ''
item['title'] = fullTitle
item['movieInfo'] = ';'.join(movieInfo)
item['star'] = star
item['quote'] = quote
yield item
nextlink = selector.xpath('//span[@class="next"]/link/@href').extract()
if nextlink:
nextlink = nextlink[0]
print nextlink
# Request: fetch the next page and parse it with this same callback
yield Request(self.url + nextlink,callback=self.parse)
def parse(self, response):
item = ZhihupythonItem()
#selector = Selector(response)
question_Field = response.xpath('//div[@class="feed-main"]')
for each in question_Field:
question = each.xpath('div[@class="content"]/h2/a/text()').extract()
print question
item['Question'] = question
yield item
def parse3(self, response):
""" ????????????ID """
selector = Selector(response)
text2 = selector.xpath('body//table/tr/td/a/@href').extract()
for elem in text2:
elem = re.findall('uid=(\d+)', elem)
if elem:
ID = int(elem[0])
if ID not in self.finish_ID: # queue only IDs that have not been crawled yet
self.scrawl_ID.append(ID)
url_next = selector.xpath(
u'body//div[@class="pa" and @id="pagelist"]/form/div/a[text()="\u4e0b\u9875"]/@href').extract()
if url_next:
yield Request(url="http://weibo.cn%s" % url_next[0], callback=self.parse3)
def parse(self, response):
selector = Selector(response)
text0 = selector.xpath('body/div[@class="u"]/div[@class="tip2"]').extract_first()
info = InfoItem()
if text0:
num_tweets = re.findall(u'\u5fae\u535a\[(\d+)\]', text0) # number of weibo posts
num_follows = re.findall(u'\u5173\u6ce8\[(\d+)\]', text0) # number of follows
num_fans = re.findall(u'\u7c89\u4e1d\[(\d+)\]', text0) # number of fans
if num_tweets:
info["num_tweets"] = int(num_tweets[0])
if num_follows:
info["num_follows"] = int(num_follows[0])
if num_fans:
info["num_fans"] = int(num_fans[0])
url_information1 = "http://weibo.cn/%s/info" % self.next_ID[-1]
yield Request(url=url_information1, meta={"item":info,"ID":self.next_ID[-1]}, dont_filter=True, callback=self.parse1)
# randomly decide whether to harvest the fans list or the follows list first
if random.random() > float(info["num_follows"])/(info["num_follows"] + info["num_fans"]):
try:
url_fans = "http://weibo.cn/%s/fans" % self.next_ID[-1]
yield Request(url=url_fans, dont_filter=True, callback=self.parse3) # fans list
except:
url_follows = "http://weibo.cn/%s/follow" % self.next_ID[-1]
yield Request(url=url_follows, dont_filter=True, callback=self.parse3) # follows list
else:
try:
url_follows = "http://weibo.cn/%s/follow" % self.next_ID[-1]
yield Request(url=url_follows, dont_filter=True, callback=self.parse3) # follows list
except:
url_fans = "http://weibo.cn/%s/fans" % self.next_ID[-1]
yield Request(url=url_fans, dont_filter=True, callback=self.parse3) # fans list
def parse4(self, response):
""" ????????????? """
selector = Selector(response)
text0 = selector.xpath('body/div[@class="u"]/div[@class="tip2"]').extract_first()
if text0:
num_follows = re.findall(u'\u5173\u6ce8\[(\d+)\]', text0) # number of follows
num_fans = re.findall(u'\u7c89\u4e1d\[(\d+)\]', text0) # number of fans
if num_follows and num_fans:
self.degree_v = int(num_fans[0]) + int(num_follows[0])
else:
self.degree_v = False
def get_list_id(self, response):
selector = Selector(response)
# collect the playlist-category links; the slice below drops the last one
url_list = selector.xpath('//body//a[@class="s-fc0"]/@href')[:-1].extract()
type_ = 0
for url in url_list:
type_ += 1
yield scrapy.FormRequest(url='http://music.163.com/m{}'.format(url), method='GET',
callback=self.parse_song_list, headers=self.headers, meta={'type': type_})
def parse_song_list(self, response):
selector = Selector(response)
song_name_list = selector.xpath('//body//ul[@class="f-hide"]/li/a/text()').extract()
song_id_list = selector.xpath('//body//ul[@class="f-hide"]/li/a/@href').extract()
for index, id_ in enumerate(song_id_list):
l = ItemLoader(item=SongListItem())
l.add_value('song_name', song_name_list[index])
l.add_value('type', response.meta['type'])
yield scrapy.FormRequest(url=self.BASE_URL + id_, meta={'song_id': id_[9:], 'loader': l}, method='GET',
headers=self.headers, callback=self.parse_single_song)
def get_list_id(self, response):
selector = Selector(response)
# collect the playlist links on the page
url_list = selector.xpath('//body//p[@class="dec"]/a/@href').extract()
for url in url_list:
yield scrapy.FormRequest(url='http://music.163.com/m{}'.format(url), method='GET',
callback=self.parse_song_list, headers=self.headers)
def parse_song_list(self, response):
selector = Selector(response)
song_name_list = selector.xpath('//body//ul[@class="f-hide"]/li/a/text()').extract()
song_id_list = selector.xpath('//body//ul[@class="f-hide"]/li/a/@href').extract()
title = selector.xpath('//title/text()').extract()
for index, id_ in enumerate(song_id_list):
l = ItemLoader(item=PlayListItem())
l.add_value('song_name', song_name_list[index])
l.add_value('title', title)
yield scrapy.FormRequest(url=self.BASE_URL + id_, meta={'song_id': id_[9:], 'loader': l}, method='GET',
headers=self.headers, callback=self.parse_single_song)
def parse_items(self, response):
print "------------"
print(response.url)
print("----------")
from scrapy.selector import Selector
import json
category = response.meta['category']['category']
sub_category = response.meta['category']['sub_category']
response_json = json.loads(response.body)
required_text = response_json["result"]["html"]
selector = Selector(text=required_text)
all_items = selector.xpath('//div[contains(@class, "grid_item")]')
for each_item in all_items:
name = each_item.xpath('.//div[@class="title"]/a/text()').extract_first()
price = each_item.xpath('.//span[@class="price"]/text()').extract_first()
image_urls = [each_item.xpath(".//img/@src").extract_first()]
affiliate_link = each_item.xpath(".//a/@href").extract_first()
website = "polyvore.com"
brand = [i for i in ALL_BRANDS if i.lower() in name.lower()]
if brand:
brand = brand[0]
print ("brand", brand)
else:
print (name, brand, "exited")
continue
item = ProductItem(
name=name.strip(),
price=price.strip(),
image_urls=image_urls,
brand=brand.strip(),
affiliate_link=affiliate_link,
category=category,
sub_category=sub_category,
website=website
)
yield item
if response_json["result"]["more_pages"] == "1":
next_page = int(response_json["result"]["page"]) + 1
else:
return
next_link = url_to_use.format(str(next_page), urllib.quote(sub_category))
my_request = scrapy.Request(
next_link,
self.parse_items)
my_request.meta['category'] = {
"sub_category": sub_category,
"category": category,
}
yield my_request
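The list comprehension over ALL_BRANDS keeps whichever matching brand comes first in the list; when one brand name contains another (say 'Gap' and 'Gap Kids'), preferring the longest match is safer. A small sketch (the ALL_BRANDS contents are whatever the project defines):

def best_brand(name, all_brands):
    """Return the longest brand whose name appears in the product title, or None."""
    matches = [b for b in all_brands if b.lower() in name.lower()]
    return max(matches, key=len) if matches else None

print(best_brand("Gap Kids denim jacket", ["Gap", "Gap Kids"]))  # -> Gap Kids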
def parse1(self, response):
selector = Selector(response)
infoItem = response.meta["item"]
ID = response.meta["ID"]
text1 = ";".join(selector.xpath('body/div[@class="c"]/text()').extract()) # ????????text()
nickname = re.findall(u'\u6635\u79f0[:|\uff1a](.*?);', text1) # ??
gender = re.findall(u'\u6027\u522b[:|\uff1a](.*?);', text1) # ??
place = re.findall(u'\u5730\u533a[:|\uff1a](.*?);', text1) # ???????????
signature = re.findall(u'\u7b80\u4ecb[:|\uff1a](.*?);', text1) # ????
birthday = re.findall(u'\u751f\u65e5[:|\uff1a](.*?);', text1) # ??
sexorientation = re.findall(u'\u6027\u53d6\u5411[:|\uff1a](.*?);', text1) # ???
marriage = re.findall(u'\u611f\u60c5\u72b6\u51b5[:|\uff1a](.*?);', text1) # ????
url = re.findall(u'\u4e92\u8054\u7f51[:|\uff1a](.*?);', text1) # ????
if nickname:
infoItem['nickname'] = nickname[0]
if gender:
infoItem['gender'] = gender[0]
if place:
place = place[0].split(" ")
infoItem["province"] = place[0]
if len(place) > 1:
infoItem["city"] = place[1]
if signature:
infoItem["signature"] = signature[0]
if birthday:
try:
birthday = datetime.datetime.strptime(birthday[0], "%Y-%m-%d")
infoItem["birthday"] = birthday - datetime.timedelta(hours=8)
except Exception:
pass
if sexorientation:
if gender and sexorientation[0] == gender[0]:
infoItem["sexorientation"] = "gay"
else:
infoItem["sexorientation"] = "Heterosexual"
if marriage:
infoItem["marriage"] = marriage[0]
if url:
infoItem["url"] = url[0]
infoItem["user_id"] = ID
yield infoItem
############ schedule the next user to crawl #########
if len(self.scrawl_ID) > 0:
ID = self.scrawl_ID.popleft()
self.finish_ID.add(ID)
url_main = "http://weibo.cn/u/%s" % ID
url_fans = "http://weibo.cn/%s/fans" % ID
url_follows = "http://weibo.cn/%s/follow" % ID
# when the queue runs low, harvest more candidate IDs from the fans/follows lists
if len(self.scrawl_ID) < 4:
yield Request(url=url_fans, dont_filter=True, callback=self.parse3) # fans list
yield Request(url=url_follows, dont_filter=True, callback=self.parse3) # follows list
yield Request(url=url_main, meta={"ID":ID}, dont_filter=True, callback=self.parse)
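The scheduling block above relies on spider state that these snippets never show; judging from popleft() and .add(), something like this minimal scaffold (the class name and seed value are illustrative):

import scrapy
from collections import deque

class WeiboSpider(scrapy.Spider):  # hypothetical class name
    name = 'weibo'
    scrawl_ID = deque(['1234567890'])  # queue of user IDs still to crawl (example seed)
    finish_ID = set()                  # user IDs already crawled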