Example source code for Python's ItemLoader() class
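The snippets below are collected from open-source projects. As a baseline for reading them, here is a minimal, self-contained sketch of the ItemLoader API they all share (the item fields, selectors, and start URL are illustrative only, not taken from any project below):

import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst

class ExampleItem(scrapy.Item):
    title = scrapy.Field()
    url = scrapy.Field()

class ExampleSpider(scrapy.Spider):
    name = 'itemloader_demo'
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        # bind the loader to an item and a response, queue values per field,
        # then build the populated item with load_item()
        l = ItemLoader(item=ExampleItem(), response=response)
        l.add_css('title', 'title::text', MapCompose(lambda s: s.strip()), TakeFirst())
        l.add_value('url', response.url)
        yield l.load_item()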

hupuScrapy.py (project: BaymaxHome, author: tyhtao1990)
def parse(self, response):
        mongoClient = mongodb_client('localhost', 27017)
        players = []  # one row of name + stat cells per player
        print "************************"
        # print response.xpath('//div[@class="about_fonts clearfix"]/p[@class="time_f"]/text()').extract()
        player_away = response.xpath('//table[@id="J_away_content"]/tbody/tr')
        player_home = response.xpath('//table[@id="J_home_content"]/tbody/tr')
        if player_away:
            for player in player_away:
                playerName=player.xpath('td/a/text()').extract()
                if playerName:
                    players.append(playerName + player.xpath('td/text()').extract())
            for player in player_home:
                playerName = player.xpath('td/a/text()').extract()
                if playerName:
                    players.append(playerName + player.xpath('td/text()').extract())
            # print response.xpath('//div[@class="message"]/h2/text()').extract()
            print "************************"

            time = response.xpath('//div[@class="about_fonts clearfix"]/p[@class="time_f"]/text()').extract()
            team = response.xpath('//div[@class="message"]/p/a/text()').extract()
            score = response.xpath('//div[@class="message"]/h2/text()').extract()
            url = response.url

            g = game(time, team, score, players, url)
            print g.__dict__
            # json_g = parsejson(g)
            # print json_g

            # g = ItemLoader(game(), response=response)
            # g.add_xpath('time', '//div[@class="about_fonts clearfix"]/p[@class="time_f"]/text()')
            # g.add_xpath('team', '//div[@class="message"]/p/a/text()')
            # g.add_xpath('score', '//div[@class="message"]/h2/text()')
            # g.add_value('players', players)
            # return g.load_item()

            client = mongoClient.connect()
            db = mongoClient.useDB(client, "hupu_data")
            print mongoClient.insert_one(db, "games", g.__dict__)
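The commented-out block above is the ItemLoader route the author left unfinished. A sketch of it made runnable, assuming (unlike the plain `game` class actually used) that game were a scrapy.Item declaring time/team/score/players fields:

    def parse_with_loader(self, response, players):
        # hypothetical variant of the commented-out code; `game` must be a
        # scrapy.Item with these fields for load_item() to accept them
        g = ItemLoader(item=game(), response=response)
        g.add_xpath('time', '//div[@class="about_fonts clearfix"]/p[@class="time_f"]/text()')
        g.add_xpath('team', '//div[@class="message"]/p/a/text()')
        g.add_xpath('score', '//div[@class="message"]/h2/text()')
        g.add_value('players', players)
        return g.load_item()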
zhihu.py (project: django-scrapy-lcv_search, author: Albino1995)
def parse_question(self, response):
        # process the question page and extract the question item from it
        question_id = response.meta.get("zhihu_id", "")
        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        item_loader.add_css("title", "h1.QuestionHeader-title::text")
        item_loader.add_css("content", ".QuestionHeader-detail")
        item_loader.add_value("url", response.url)
        item_loader.add_value("zhihu_id", question_id)
        item_loader.add_css("answer_num", ".List-headerText span::text")
        item_loader.add_css("comments_num", ".QuestionHeader-actions button::text")
        item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
        item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")

        question_item = item_loader.load_item()

        yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0), headers=self.headers, callback=self.parse_answer)
        yield question_item
example.py (project: scrapy_redis_spider, author: lymlhhj123)
def _extract_item(self, response):
        # inspect the response in a scrapy shell
        #inspect_response(response, self)

        # open the response scrapy fetched in a browser, to compare it with what the browser itself renders
        #open_in_browser(response)

        # extract the fields
        l = ItemLoader(response=response, item=MyspiderItem(), type='html')
        l.add_xpath('movie_name', '//h1/span[@property="v:itemreviewed"]/text()')
        l.add_xpath('movie_year', '//span[@property="v:initialReleaseDate"]/text()')
        l.add_xpath('movie_type', '//span[@property="v:genre"]/text()')
        l.add_xpath('movie_rate', '//strong[@class="ll rating_num"]/text()')
        l.add_value('url', response.url)
        # load_item() returns the populated scrapy.Item object
        # scrapy-redis json-encodes items before pushing them into redis
        # json cannot encode arbitrary python objects, so the item is converted to a dict first
        return dict(l.load_item())
com5442.py (project: ScrapyImage, author: donnki)
def parse_first_page(self, response):
        count = int(response.xpath('//div[@id="aplist"]/ul/li[1]/a/text()')[0].re(r'.*?(\d+).*?')[0])
        title = response.request.cookies['title']
        albumURL = response.url.replace(".html", '')
        for x in xrange(1,count+1):
            suffix = ".html"
            if x > 1:
                suffix = "_"+str(x)+".html"
                request = scrapy.Request(albumURL+suffix, callback=self.parse_item, cookies={'title': title})
                yield request
        l = ItemLoader(item=PageItem(), response=response)
        l.add_value('title', title)
        l.add_value('name', self.name)
        l.add_value('url', response.url)
        l.add_xpath('image_urls', '//p[@id="contents"]/a/img/@src')
        yield l.load_item()
tuba77.py (project: ScrapyImage, author: donnki)
def parse_first_page(self, response):
        count = int(response.xpath('//ul[@class="image"]/text()')[0].re(r'.*?(\d+).*?')[0])
        title = response.request.cookies['title']
        albumURL = response.url.replace(".shtml", '')
        # print u'', count, title, albumURL
        for x in xrange(1,count+1):
            suffix = ".shtml"
            if x > 1:
                suffix = "_"+str(x)+".shtml"
                # print u'',albumURL+suffix
                request = scrapy.Request(albumURL+suffix, callback=self.parse_item, cookies={'title': title})
                yield request

        l = ItemLoader(item=PageItem(), response=response)
        l.add_value('title', title)
        l.add_value('name', self.name)
        l.add_value('url', response.url)
        l.add_xpath('image_urls', '//td[@valign="top"]/img/@src')
        yield l.load_item()
eol.py (project: gaokao, author: EasyData)
def parse(self, response):

        for outer in response.css('#comapreTable tr:not(:first-child)'):

            if outer.css('td[align="center"]'):
                ccode = outer.css('td[align="center"]>a::attr(id)').extract_first()
                cname = outer.css('td[align="center"]>a::text').extract_first()

            for inner in outer.xpath('td[div[@align="left"]/a]'):
                loader = ItemLoader(item=EolZhuanyeItem(), selector=inner)
                loader.add_value('ccode', ccode)
                loader.add_value('cname', cname)
                loader.add_css('url', 'a::attr(href)', lambda urls: urljoin(self.start_urls[0], urls[0]))
                loader.add_xpath('code', 'following-sibling::td[1]/text()', MapCompose(unicode.strip))
                loader.add_css('name', 'a::text', MapCompose(unicode.strip))
                item = loader.load_item()

                yield Request(url=item['url'][0], meta={'item': item}, callback=self.parse_item)
zhihu.py (project: ZhihuSpider, author: ShayChris)
def parse_question(self, response):
        question_pattern = re.compile('(.*zhihu.com/question/(\d+))(/|$).*')
        match_object = re.match(question_pattern, response.url)
        question_id = match_object.group(2)
        item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
        item_loader.add_value('zhihu_id', question_id)
        item_loader.add_css('title', 'h1.QuestionHeader-title::text')
        item_loader.add_css('topics', '.TopicLink .Popover div::text')
        item_loader.add_value('url', response.url)
        item_loader.add_css('content', '.QuestionHeader-detail div div span::text')
        item_loader.add_css('answer_num', '.List-headerText span::text')
        item_loader.add_css('comments_num', '.QuestionHeader-Comment button::text')
        item_loader.add_css('watch_user_num', '.NumberBoard-value::text')

        item = item_loader.load_item()
        yield item
        yield scrapy.Request(self.start_answer_url.format(question_id=question_id, offset=0, limit=20),
                             headers=self.headers, callback=self.parse_answer)
MovieSpider.py (project: douban_movie_scrapy, author: lanxing)
def parse(self, response):
        try:
            l = ItemLoader(item=MovieItem(), response=response)
            l.add_value('name',
                        response.css('div#content h1 [property="v:itemreviewed"]::text').extract_first().strip())
            year = response.css('div#content h1 span.year::text').extract_first()
            if year.startswith('('):
                year = year[1:-1]
            l.add_value('year', year)

            newStrL = []
            for val in response.css('div#info::text').extract():
                newStr = val.strip().strip('/')
                if newStr != '':
                    newStrL.append(newStr)
                    if len(newStrL) == 2:
                        break

            if len(newStrL) == 2:
                l.add_value('region', newStrL[0].split('/'))
                l.add_value('language', newStrL[1].split('/'))

            l.add_value('duration', response.css('div#info [property="v:runtime"]::attr(content)').extract_first())
            l.add_value('types', response.css('div#info [property="v:genre"]::text').extract())
            l.add_value('directors', response.css('div#info [rel="v:directedBy"]::text').extract())
            l.add_value('actors', response.css('div#info [rel="v:starring"]::text').extract())
            l.add_value('runtime', response.css('div#info [property="v:initialReleaseDate"]::text').extract())
            l.add_value('detailurl', response.url)
            l.add_value('IMDburl', response.css('div#info [rel="nofollow"]::attr(href)').extract())
            l.add_value('stars', response.css('strong[property="v:average"]::text').extract_first())
            return l.load_item()
        except Exception:
            pass
proc.py (project: EasyGoSpider, author: Karmenzind)
def parse_item(self, response):
        url = response.url
        item_idx = self.all_urls[url]
        self.logger.info("Trying page %s %s" % (item_idx, url))

        resp_dct = json.loads(response.body)

        l = ItemLoader(item=HeatMapItem(), response=response)
        current_hour = time.strftime("%Y%m%d%H", time.localtime())
        l.add_value('cur_hour', current_hour)
        l.add_value('serial', item_idx)
        l.add_value('data', resp_dct.pop('data'))
        l.add_value('timestamp', resp_dct.pop('nt'))
        l.add_value('others', resp_dct)
        l.add_value('url', url)
        l.add_value('is_parsed', 0)

        self.finished.add(item_idx)
        self.logger.info(u"Crawling %s, %s successfully. :)" % (item_idx, url))
        self.claim_completeness()
        yield l.load_item()
        # else:
        #     if resp_dct.get("data") == "\\u8be5\\u7528\\u6237\\u8bbf\\u95ee\\u6b21\\u6570\\u8fc7\\u591a".decode(
        #             'unicode_escape'):  # i.e. "this user has made too many requests"
        #         banned_cookie = response.request.cookies
        #         self.logger.warning("%s has been BANNED today." % banned_cookie)
        #         self.cookies.remove(banned_cookie)
        #         yield {"BannedCookieToday": banned_cookie}
        #     else:
        #         yield {}
        #     self.logger.error(u"Crawling %s, %s failed. :(" % (item_idx, response.url))
areas.py (project: scrapy-soccerway, author: tvl)
def parse(self, response):
        l = ItemLoader(item=Area(), response=response)
        l.add_value('id', parse_qs(response.xpath('//div[@class="clearfix subnav level-1"]//li//a[2]/@href').extract()[0])['area_id'][0])
        l.add_xpath('name', '//div[@class="clearfix subnav level-1"]//li//a[2]/text()')
        l.add_value('updated', datetime.utcnow().isoformat()) # you can also use literal values
        return l.load_item()
        #self.log('URL: {}'.format(response.url))
myspider.py (project: scrapy_redis_splash_spider, author: lymlhhj123)
def _extract_item(self, response):
        # extract the fields
        l = ItemLoader(response=response, item=MyspiderItem(), type='html')
        l.add_xpath('movie_name', '//h1/span[@property="v:itemreviewed"]/text()')
        l.add_xpath('movie_year', '//span[@property="v:initialReleaseDate"]/text()')
        l.add_xpath('movie_type', '//span[@property="v:genre"]/text()')
        l.add_xpath('movie_rate', '//strong[@class="ll rating_num"]/text()')
        l.add_value('url', response.url)
        # load_item() returns the populated scrapy.Item object
        # scrapy-redis json-encodes items before pushing them into redis
        # json cannot encode arbitrary python objects, so the item is converted to a dict first
        return dict(l.load_item())
lof.py (project: WSIL, author: criticalerrors)
def get_details(self, response):
        self.log('Starting the second parsing phase')
        loader = ItemLoader(item=LibraryOrFrameworkItem(), response=response)

        # Load the values obtained in the first phase
        loader.add_value('name', response.meta['name'])

        language = response.meta['language']

        loader.add_value('stable_release', response.meta['stable_version'])
        loader.add_value('release_date', response.meta['rel_date'])


        descr = response.xpath('//*[@id="mw-content-text"]/div/p[1] | //*[@id="mw-content-text"]/p[1]').extract_first()
        cleaned_descr = cleanhtml(descr)
        loader.add_value('description', cleaned_descr)

        license_found = False
        for row in response\
                    .xpath('//*[@id="mw-content-text"]/div/table[position()<=3]/tr'):
            header = row.xpath('./th/a/text() | ./th/text()').extract_first()
            key, value = self.get_key_value(header, row)
            if key:
                if key == 'license':  # if the license appears on the main page, use it
                    license_found = True
                loader.add_value(key, value)
        # If the license was not found on the main page,
        # fall back to the license found on the start page
        if not license_found:
            loader.add_value('license', response.meta['license'])

        return {
            "item": loader.load_item(),
            "language": language
            # the language is returned separately in order to manage the many-to-many relation
        }

    # Given a pair (key, elem) obtained during scraping, returns the valid pair
    # (key1, value1) to add to the db. If key is not valid, returns (None, None).
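get_key_value itself is not part of this excerpt. A minimal sketch of the contract those comments describe, with a hypothetical header-to-field mapping (the real rules are project-specific):

    def get_key_value(self, header, row):
        # map a Wikipedia infobox header to a db field; anything we do not
        # store yields (None, None)
        mapping = {'License': 'license', 'Stable release': 'stable_release'}
        if header is None or header.strip() not in mapping:
            return None, None
        value = row.xpath('./td//text()').extract_first()
        return mapping[header.strip()], value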
BookScraperXpath.py (project: scrapyfundamentals, author: zseta)
def parse_book(self, response):
        book_loader = ItemLoader(item=BookItem(), response=response)
        book_loader.default_input_processor = MapCompose(remove_tags)
        book_loader.default_output_processor = TakeFirst()

        book_loader.add_xpath("title", "//div[@class='col-sm-6 product_main']/h1")
        book_loader.add_xpath("price", "//p[@class='price_color']")
        book_loader.add_xpath("upc", "//table[@class='table table-striped']/tr[1]/td")
        book_loader.add_xpath("product_type", "//table[@class='table table-striped']/tr[2]/td")
        book_loader.add_xpath("tax", "//table[@class='table table-striped']/tr[5]/td")
        book_loader.add_xpath("stock", "//table[@class='table table-striped']/tr[6]/td")
        book_loader.add_xpath("reviews", "//table[@class='table table-striped']/tr[7]/td")
        book_loader.add_xpath("rating", "//p[@class='instock availability']/following-sibling::p/@class")
        yield book_loader.load_item()
BookScraperCss.py (project: scrapyfundamentals, author: zseta)
def parse_book(self, response):
        book_loader = ItemLoader(item=BookItem(), response=response)
        book_loader.default_input_processor = MapCompose(remove_tags)

        book_loader.add_value("image_urls", response.urljoin(response.css(".item.active > img::attr(src)").extract_first()))

        book_loader.add_css("title", ".col-sm-6.product_main > h1", TakeFirst())
        book_loader.add_css("price", ".price_color", TakeFirst())
        book_loader.add_css("upc", ".table.table-striped > tr:nth-child(1) > td", TakeFirst())
        book_loader.add_css("product_type", ".table.table-striped > tr:nth-child(2) > td", TakeFirst())
        book_loader.add_css("tax", ".table.table-striped > tr:nth-child(5) > td", TakeFirst())
        book_loader.add_css("stock", ".table.table-striped > tr:nth-child(6) > td", TakeFirst())
        book_loader.add_css("reviews", ".table.table-striped > tr:nth-child(7) > td", TakeFirst())
        book_loader.add_css("rating", ".star-rating::attr(class)", TakeFirst())
        return book_loader.load_item()
SampleSpider.py (project: scrapyfundamentals, author: zseta)
def parse(self, response):
        for quote in response.css(".quote"):
            loader = ItemLoader(item=QuoteItem(), selector=quote)

            loader.add_css("text", ".text")
            loader.add_css("by", ".authoor")
            loader.add_css("tags", ".tag")
            yield loader.load_item()
countryspider.py (project: scrapyfundamentals, author: zseta)
def parse(self, response):

        for country in response.css(".col-md-4, .country"):
            item = ItemLoader(item=CountryItem(), selector=country)

            item.add_css("country", ".country-name")
            item.add_css("capital", ".country-capital::text")
            item.add_css("population", ".country-population::text")
            item.add_css("area", ".country-area::text")

            yield item.load_item()
lianjia_bj_zufang.py (project: Crawler-Of-Lianjia, author: tonywangcn)
def parse(self, response):
        panels = response.xpath("//div[@class='info-panel']")
        for i in range(0, len(panels.xpath("./h2/a/text()").extract())):
            l = ItemLoader(item=LianjiaItem(), response=response)
            info = panels.xpath("./h2/a/text()").extract()[i].encode('utf-8')
            local = panels.xpath(".//span[@class='region']/text()").extract()[i].encode('utf-8')
            house_layout = panels.xpath(".//span[@class='zone']//text()").extract()[i].encode('utf-8')
            house_square = panels.xpath(".//span[@class='meters']/text()").extract()[i].encode('utf-8')
            # the index arithmetic below assumes a fixed number of matched nodes per listing
            house_orientation = panels.xpath(".//div[@class='where']//span/text()").extract()[(i + 1) * 4 - 1].encode('utf-8')
            district = panels.xpath(".//div[@class='con']/a/text()").extract()[i].encode('utf-8')[:-6]
            floor = panels.xpath(".//div[@class='con']//text()").extract()[(i + 1) * 5 - 3].encode('utf-8')
            building_year = panels.xpath(".//div[@class='con']//text()").extract()[(i + 1) * 5 - 1].encode('utf-8')
            price_month = panels.xpath(".//span[@class='num']//text()").extract()[(i + 1) * 2 - 2].encode('utf-8')
            person_views = panels.xpath(".//span[@class='num']//text()").extract()[(i + 1) * 2 - 1].encode('utf-8')
            tags = [t.encode('utf-8') for t in response.xpath("//div[@class='view-label left']")[i].xpath(".//span//text()").extract()]
            l.add_value('info',info)
            l.add_value('local',local)
            l.add_value('house_layout',house_layout)
            l.add_value('house_square',house_square)
            l.add_value('house_orientation',house_orientation)
            l.add_value('district',district)
            l.add_value('floor',floor)
            l.add_value('building_year',building_year)
            l.add_value('price_month',price_month)
            l.add_value('person_views',person_views)
            l.add_value('tags',tags)
            print l
            yield l.load_item()
lianjia_ershou.py (project: Crawler-Of-Lianjia, author: tonywangcn)
def parse(self, response):
        jsonresponse = json.loads(response.body_as_unicode())
        for house in jsonresponse['data']['list']:
            l = ItemLoader(item=LianjiaErshouItem(), response=response)
            house_code = house['house_code']
            price_total = house['price_total']
            ctime = house['ctime']
            title = house['title']
            frame_hall_num = house['frame_hall_num']
            tags = house['tags']
            house_area = house['house_area']
            community_id = house['community_id']
            community_name = house['community_name']
            is_two_five = house['is_two_five']
            frame_bedroom_num = house['frame_bedroom_num']
            l.add_value('house_code',house_code)
            l.add_value('price_total',price_total)
            l.add_value('ctime',ctime)
            l.add_value('title',title)
            l.add_value('frame_hall_num',frame_hall_num)
            l.add_value('tags',tags)
            l.add_value('house_area',house_area)
            l.add_value('community_id',community_id)
            l.add_value('community_name',community_name)
            l.add_value('is_two_five',is_two_five)
            l.add_value('frame_bedroom_num',frame_bedroom_num)
            print l
            yield l.load_item()
zhihu.py (project: fintech_spider, author: hee0624)
def parse_question(self, response):
        # process the question page and extract the question item from it
        if "QuestionHeader-title" in response.text:
            # new-style page
            match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*", response.url)
            if match_obj:
                question_id = int(match_obj.group(2))

            item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
            item_loader.add_css("title", "h1.QuestionHeader-title::text")
            item_loader.add_css("content", ".QuestionHeader-detail")
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_css("answer_num", ".List-headerText span::text")
            item_loader.add_css("comments_num", ".QuestionHeader-actions button::text")
            item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
            item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")

            question_item = item_loader.load_item()
        else:
            # extract the item from the old-style page
            match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*", response.url)
            if match_obj:
                question_id = int(match_obj.group(2))

            item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
            # item_loader.add_css("title", ".zh-question-title h2 a::text")
            item_loader.add_xpath("title", "//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()")
            item_loader.add_css("content", "#zh-question-detail")
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_css("answer_num", "#zh-question-answer-num::text")
            item_loader.add_css("comments_num", "#zh-question-meta-wrap a[name='addcomment']::text")
            # item_loader.add_css("watch_user_num", "#zh-question-side-header-wrap::text")
            item_loader.add_xpath("watch_user_num", "//*[@id='zh-question-side-header-wrap']/text()|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()")
            item_loader.add_css("topics", ".zm-tag-editor-labels a::text")

            question_item = item_loader.load_item()

        yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0), headers=self.headers, callback=self.parse_answer)
        yield question_item
items.py (project: FirstSpider, author: yipwinghong)
def join_tags(value):
    return ','.join([i for i in value if i])


# used as an output_processor on an ItemLoader field
# an ItemLoader collects each field as a list; turning that list into a single string is the output_processor's job
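A sketch of how join_tags would be attached; the Item definition below is assumed, since the excerpt shows only the processor function:

import scrapy

class ArticleItem(scrapy.Item):
    # the loader collects 'tags' as a list; join_tags collapses that list
    # into one comma-separated string when load_item() runs
    tags = scrapy.Field(output_processor=join_tags)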
imgspider.py (project: ScrapyImage, author: donnki)
def parse_page(self, response):
        # parse one page of images
        # print u'~~~~', unicode(response.body, "gbk").encode("utf8")
        # print(self.config["xpathImagesPath"])
        # print(response.xpath(self.config["xpathImagesPath"]))
        l = ItemLoader(item=PageItem(), response=response)
        l.add_value('title', response.request.cookies['title'])
        l.add_value('name', self.config["id"])
        l.add_value('url', response.url)
        if self.config.has_key("imageUrlReplacement"):
            l.add_value('replace', self.config["imageUrlReplacement"])

        if self.config.has_key("xpathImagesPath"):
            l.add_xpath('image_urls', self.config["xpathImagesPath"])
        if self.config.has_key("xpathFilesPath"):
            l.add_xpath('file_urls', self.config["xpathFilesPath"])
        yield l.load_item()

        # TODO: if there is a next image page, keep following it with parse_page
        if self.config.has_key("xpathNextImageUrl"):
            nextUrls = response.xpath(self.config["xpathNextImageUrl"])
            if len(nextUrls) > 0:
                nextPage = nextUrls.extract()[0]
                if not nextPage.startswith("http"):
                    if nextPage.startswith("/"):
                        nextPage = response.url[0:response.url.index("/",10)+1]+nextPage 
                    else:
                        nextPage = response.url[0:response.url.rfind("/")+1]+nextPage 
                request = scrapy.Request(nextPage, callback=self.parse_page, cookies={'title': response.request.cookies['title']})
                yield request
com5442.py (project: ScrapyImage, author: donnki)
def parse_item(self, response):
        l = ItemLoader(item=PageItem(), response=response)
        l.add_value('title', response.request.cookies['title'])
        l.add_value('name', self.name)
        l.add_value('url', response.url)
        l.add_xpath('image_urls', '//p[@id="contents"]/a/img/@src')
        return l.load_item()
tuba77.py (project: ScrapyImage, author: donnki)
def parse_item(self, response):
        l = ItemLoader(item=PageItem(), response=response)
        l.add_value('title', response.request.cookies['title'])
        l.add_value('name', self.name)
        l.add_value('url', response.url)
        l.add_xpath('image_urls', '//td[@valign="top"]/img/@src')
        return l.load_item()
gaokaopai.py (project: gaokao, author: EasyData)
def parse_item(self, response):

        loader = ItemLoader(GaokaopaiZhiyeItem(), response)
        loader.add_value('url', response.url)
        loader.add_value('code', response.url, re=ur'-([^-]+)\.html')
        loader.add_css('name', u'.modTitle>h1::text')

        def parse_category():
            for e in response.css(u'.catType>a'):
                yield {
                    'url': e.css('::attr(href)').extract_first(),
                    'code': e.css('::attr(href)').re_first(ur'-([^-]+)\.html'),
                    'name': e.css('::text').extract_first(),
                }

        loader.add_value('category', list(parse_category()))
        loader.add_css('detail', u'.zhiyeShow')

        item = loader.load_item()

        return FormRequest(
            url='http://www.gaokaopai.com/ajax-career-getRelateMajor.html',
            formdata={'code': item['code'][0]},
            meta={'item': item},
            dont_filter=True,
            callback=self.parse_majors
        )
eol.py (project: gaokao, author: EasyData)
def parse_item(self, response):

        loader = ItemLoader(EolZhiyeItem(), response)
        loader.add_value('url', response.url)
        loader.add_value('code', response.url, re=r'/(\w+)\.shtml')
        loader.add_css('name', 'h1#pagetitle::text')
        loader.add_xpath('category', u'//div[@id="precontent"]/p[contains(., "??")]/a/text()')
        loader.add_xpath('category2', u'//div[@id="precontent"]/p[contains(., "??")]/a/text()')
        loader.add_xpath('detail', u'//div[@id="precontent"]/following-sibling::node()[not(self::table)]', Join('\n'))
        yield loader.load_item()
zhihu.py (project: Charlotte, author: LiZoRN)
def parse_question(self, response):
        # process the question page and extract the question item from it
        if "QuestionHeader-title" in response.text:
            # new-style page
            match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*", response.url)
            if match_obj:
                question_id = int(match_obj.group(2))

            item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
            item_loader.add_css("title", "h1.QuestionHeader-title::text")
            item_loader.add_css("content", ".QuestionHeader-detail")
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_css("answer_num", ".List-headerText span::text")
            item_loader.add_css("comments_num", ".QuestionHeader-actions button::text")
            item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
            item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")

            question_item = item_loader.load_item()
        else:
            # extract the item from the old-style page
            match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*", response.url)
            if match_obj:
                question_id = int(match_obj.group(2))

            item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
            # item_loader.add_css("title", ".zh-question-title h2 a::text")
            item_loader.add_xpath("title", "//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()")
            item_loader.add_css("content", "#zh-question-detail")
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_css("answer_num", "#zh-question-answer-num::text")
            item_loader.add_css("comments_num", "#zh-question-meta-wrap a[name='addcomment']::text")
            # item_loader.add_css("watch_user_num", "#zh-question-side-header-wrap::text")
            item_loader.add_xpath("watch_user_num", "//*[@id='zh-question-side-header-wrap']/text()|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()")
            item_loader.add_css("topics", ".zm-tag-editor-labels a::text")

            question_item = item_loader.load_item()

        yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0), headers=self.headers, callback=self.parse_answer)
        yield question_item
plants_loader.py (project: planty, author: agnaite)
def parse(self, response):
        l = ItemLoader(item=PlantItem(), response=response)

        l.add_xpath('name', "//div[@id='bodycontent']/div[@class='post']/div[@class='pagebanner']/h2/text()")
        l.add_xpath('species', "//div[@id='bodycontent']/div[@class='post']/div[@class='pagebanner']/div[@class='clear resultSpecies']/text()")
        l.add_xpath('key', "//div[@id='bodycontent']/div[@class='post']/div[@class='contents']/div[@id='tabbedinfo']/div[@class='tabscontain']/div[@class='tabs']/div[@class='post-meta']/div[@class='post-meta-key']/text()")
        l.add_xpath('value', "//div[@id='bodycontent']/div[@class='post']/div[@class='contents']/div[@id='tabbedinfo']/div[@class='tabscontain']/div[@class='tabs']/div[@class='post-meta']/div[@class='post-meta-value']/child::node()")
        # l.add_xpath('value', "//div[@id='bodycontent']/div[@class='post']/div[@class='contents']/div[@id='tabbedinfo']/div[@class='tabscontain']/div[@class='tabs']/div[@class='post-meta']/div[@class='post-meta-value']/a/text()")

        return l.load_item()
rosi.py (project: crawlers, author: evilcos)
def parse_item(self, response):
        il = ItemLoader(item=ImageItem(), response=response)
        il.add_css('image_urls', 'img::attr(src)')
        return il.load_item()
Zhihu.py (project: zhihu_spider, author: pujinxiao)
def parse_question(self,response):
        # process the question page and extract the question item from it
        if "QuestionHeader-title" in response.text:
            # new-style page
            match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*", response.url)
            if match_obj:
                question_id = int(match_obj.group(2))
            item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
            item_loader.add_css("title", "h1.QuestionHeader-title::text")
            item_loader.add_css("content", ".QuestionHeader-detail")
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_css("answer_num", ".List-headerText span::text")
            item_loader.add_css("comments_num", ".QuestionHeader-actions button::text")
            item_loader.add_css("watch_user_num", ".NumberBoard-value::text")
            item_loader.add_css("topics", ".QuestionHeader-topics .Popover div::text")
            question_item = item_loader.load_item()
        else:
            # extract the item from the old-style page
            match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*", response.url)
            if match_obj:
                question_id = int(match_obj.group(2))
            item_loader = ItemLoader(item=ZhihuQuestionItem(), response=response)
            # item_loader.add_css("title", ".zh-question-title h2 a::text")
            item_loader.add_xpath("title",
                                  "//*[@id='zh-question-title']/h2/a/text()|//*[@id='zh-question-title']/h2/span/text()")
            item_loader.add_css("content", "#zh-question-detail")
            item_loader.add_value("url", response.url)
            item_loader.add_value("zhihu_id", question_id)
            item_loader.add_css("answer_num", "#zh-question-answer-num::text")
            item_loader.add_css("comments_num", "#zh-question-meta-wrap a[name='addcomment']::text")
            # item_loader.add_css("watch_user_num", "#zh-question-side-header-wrap::text")
            item_loader.add_xpath("watch_user_num",
                                  "//*[@id='zh-question-side-header-wrap']/text()|//*[@class='zh-question-followers-sidebar']/div/a/strong/text()")
            item_loader.add_css("topics", ".zm-tag-editor-labels a::text")
            question_item = item_loader.load_item()
        yield scrapy.Request(self.start_answer_url.format(question_id, 20, 0), headers=self.headers, callback=self.parse_answer)  # pass in question_id to fetch its answers
        yield question_item
playlist.py (project: cloudmusic_api, author: yqh231)
def parse_song_list(self, response):
        selector = Selector(response)

        song_name_list = selector.xpath('//body//ul[@class="f-hide"]/li/a/text()').extract()
        song_id_list = selector.xpath('//body//ul[@class="f-hide"]/li/a/@href').extract()
        title = selector.xpath('//title/text()').extract()
        for index, id_ in enumerate(song_id_list):
            l = ItemLoader(item=PlayListItem())
            l.add_value('song_name', song_name_list[index])
            l.add_value('title', title)
            yield scrapy.FormRequest(url=self.BASE_URL + id_, meta={'song_id': id_[9:], 'loader': l}, method='GET',
                                     headers=self.headers, callback=self.parse_single_song)
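parse_single_song is outside this excerpt. A minimal sketch of how the half-filled loader carried in meta could be completed (the song_id field is an assumption):

    def parse_single_song(self, response):
        # pick up the loader that parse_song_list stashed in meta and finish it
        l = response.meta['loader']
        l.add_value('song_id', response.meta['song_id'])  # hypothetical field
        yield l.load_item()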

