Python: example source code using the Request() class
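The snippets below are collected from open-source Scrapy projects. They all assume Scrapy's Request (and, where used, FormRequest and Selector) has been imported. A minimal, self-contained sketch of the basic pattern every snippet builds on:

import scrapy
from scrapy import Request


class ExampleSpider(scrapy.Spider):
    name = 'example'
    start_urls = ['http://example.com']

    def parse(self, response):
        # Schedule a follow-up request; Scrapy calls the callback
        # with the downloaded Response.
        yield Request(url='http://example.com/page/2', callback=self.parse)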

reference_news_spider.py — project: Spider_cust_news, author: sensen58588
def parse_detail(self, response):
        content = response.css('#work span::text').extract()
        reg = "^(http|https|ftp)://.*(.com|.cn|.html|.htm|.asp|.jsp)"
        url = response.url
        reg_url_name = ".*?(\d+)"
        get_url = re.match(reg_url_name, url)
        if get_url:
            self.get_name = get_url.group(1)
        reference_url_list = []
        for each_line in content:
            get_reference_url = re.match(reg, each_line)
            if get_reference_url:
                reference_url_list.append(get_reference_url.group(0))
        self.count = 0
        if reference_url_list:
            for each_url in reference_url_list:
                yield Request(url=each_url, dont_filter=True, callback=self.parse_reference)
                self.count += 1
163_spider.py — project: mongodb_project, author: Lovecanon
def post_get_playlist(self, response):
        collection = self.db.playlist
        result = json.loads(response.body, encoding='utf-8')['result']

        # inserted = collection.update({'id': result['id']}, result, upsert=True)  # upsert=True: insert or update
        # logger.info('Update or Insert to playlist database[%s]' % (str(inserted),))
        if result['id'] not in self.playlist_id_buffer:
            collection.insert(result)

        for song in result['tracks']:
            artists = []
            for detail in song['artists']:
                artists.append(detail['name'])
            comment_url = 'http://music.163.com/weapi/v1/resource/comments/%s/?csrf_token=' % (song['commentThreadId'],)
            # FormRequest sends a form-encoded POST; the raw-body equivalent is:
            # Request(url, method='POST', body=json.dumps(data))
            yield FormRequest(comment_url, formdata=self.post_data, callback=self.parse,
                              meta={'m_id': song['id'], 'm_name': song['name'], 'artists': artists})
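As the comment above notes, the same POST can be sent either with FormRequest (form-encoded body) or with a plain Request carrying a hand-built body. A hedged sketch of both variants; the URL and payload here are placeholders, not the project's real values:

import json
from scrapy import Request, FormRequest

url = 'http://music.163.com/weapi/v1/resource/comments/THREAD_ID/?csrf_token='
post_data = {'params': '...', 'encSecKey': '...'}  # placeholder payload

# Variant 1: form-encoded POST, as in the snippet above
req_form = FormRequest(url, formdata=post_data)

# Variant 2: raw POST body, per the commented-out alternative
req_raw = Request(url, method='POST', body=json.dumps(post_data),
                  headers={'Content-Type': 'application/json'})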
mogujie.py — project: first-crawler, author: Xinghaoz
def parse_list(self, response):
        url = response.meta['splash']['args']['url']
        pattern = re.compile(r'http://www\.mogujie\.com/book/\w+/\d+/')

        if pattern.match(url):
            page = int(pattern.split(url)[1])
            url = pattern.findall(url)[0]
            page += 1
            url = url + str(page)
        else:
            url = url + '/2'

        print '+++++++++++++++++++++++++ Next url:', url
        req = SplashRequest(url = url, callback = self.parse_list)
        yield req

        pattern_detail = re.compile(r'http://shop\.mogujie\.com/detail/.{7}')
        for item_url in pattern_detail.findall(response.body):
            req = Request(url = item_url, callback = self.parse_item)
            yield req
serialize.py — project: scrappy, author: DormyMo
def default(self, o):
        if isinstance(o, datetime.datetime):
            return o.strftime("%s %s" % (self.DATE_FORMAT, self.TIME_FORMAT))
        elif isinstance(o, datetime.date):
            return o.strftime(self.DATE_FORMAT)
        elif isinstance(o, datetime.time):
            return o.strftime(self.TIME_FORMAT)
        elif isinstance(o, decimal.Decimal):
            return str(o)
        elif isinstance(o, defer.Deferred):
            return str(o)
        elif isinstance(o, BaseItem):
            return dict(o)
        elif isinstance(o, Request):
            return "<%s %s %s>" % (type(o).__name__, o.method, o.url)
        elif isinstance(o, Response):
            return "<%s %s %s>" % (type(o).__name__, o.status, o.url)
        elif isinstance(o, Crawler):
            return o.stats.get_stats()
        else:
            return super(ScrapyJSONEncoder, self).default(o)
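A quick usage sketch of the encoder above: Scrapy objects that are not JSON-serializable get rendered as readable placeholder strings, in the format the Request/Response branches define. This assumes the stock ScrapyJSONEncoder from scrapy.utils.serialize:

from scrapy import Request
from scrapy.utils.serialize import ScrapyJSONEncoder

encoder = ScrapyJSONEncoder()
print(encoder.encode({'req': Request('http://example.com')}))
# {"req": "<Request GET http://example.com>"}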
anchor_web.py — project: TvLive, author: Rano1
def get_all_page(self, response):
        all_page = 0  # total number of pages
        current_page = 1  # current page number
        body = str(response.body)
        regex_str = ".*?PAGE.pager = ({.*?});.*"
        pager = re.match(regex_str, body)
        if pager:
            pager_data = pager.group(1).replace('\\n', '').replace('\\r', '').replace(" ", "")
            regex_str = r'.*count:"(\d+)".*'
            all_page = int(re.match(regex_str, pager_data).group(1))
        print("all_page :" + str(all_page))
        # yield a scrapy Request for every remaining page
        while current_page <= all_page:
            url = apiconstants.get_douyu_list_url(current_page)
            print(url)
            current_page = current_page + 1
            yield Request(url=url, callback=self.parse)
        print("????")
anchor.py — project: TvLive, author: Rano1
def get_all_page(self, response):
        all_page = 0  # total number of pages
        current_page = 1  # current page number
        body = str(response.body)
        regex_str = ".*?PAGE.pager = ({.*?});.*"
        pager = re.match(regex_str, body)
        if pager:
            pager_data = pager.group(1).replace('\\n', '').replace('\\r', '').replace(" ", "")
            regex_str = r'.*count:"(\d+)".*'
            all_page = int(re.match(regex_str, pager_data).group(1))
        print("all_page :" + str(all_page))
        # yield a scrapy Request for every remaining page
        while current_page <= all_page:
            url = apiconstants.get_douyu_list_url(current_page)
            print(url)
            current_page = current_page + 1
            yield Request(url=url, callback=self.parse)
        print("????")
spider.py — project: cl1024, author: wuchujiecode
def get_torrent(self, response):
        sel = Selector(response)
        cl_title = sel.xpath('//td[@class="h"]/text()[2]').extract_first()
        cl_bankuai = sel.xpath('//div[@class="t3"]/table/tr/td/b/a[2]/text()').extract_first()
        cl_url = response.url
        torrent = re.search(r'rmdown\.com(.+?)</a>', response.body)
        torrent_url = 'http://www.' + torrent.group()[:-4]
        posted = sel.xpath('//div[@class="tipad"]/text()').extract()[1]
        posted = posted.encode('utf-8')[9:-7]
        yield Request(
            url=torrent_url,
            meta={
                'cl_title': cl_title,
                'cl_bankuai': cl_bankuai,
                'cl_url': cl_url,
                'posted': posted,
            },
            callback=self.parse_item,
            dont_filter=True)
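The meta dict attached above travels with the request and is exposed again as response.meta in the callback. A minimal sketch of what parse_item might look like; the item structure is an assumption, not the project's actual code:

def parse_item(self, response):
    # Values set via Request(meta=...) come back on response.meta
    yield {
        'title': response.meta['cl_title'],
        'bankuai': response.meta['cl_bankuai'],
        'source_url': response.meta['cl_url'],
        'posted': response.meta['posted'],
        'torrent_page': response.url,
    }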
crawlpy_spider.py — project: crawlpy, author: cytopia
def init_request(self):
        """This function is called before crawling starts."""

        # Do not start a request on error,
        # simply return nothing and quit scrapy
        if self.abort:
            return

        logging.info('All set, start crawling with depth: ' + str(self.max_depth))

        # Do a login
        if self.config['login']['enabled']:
            # Start with login first
            logging.info('Login required')
            return Request(url=self.login_url, callback=self.login)
        else:
            # Start with the parse function
            logging.info('No login required')
            return Request(url=self.base_url, callback=self.parse)



    #----------------------------------------------------------------------
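The login callback referenced above is not shown here. A minimal sketch, assuming an ordinary HTML login form and Scrapy's FormRequest.from_response; the form field names, config keys, and the post_login callback are placeholders:

from scrapy import FormRequest

def login(self, response):
    # Fill and submit the login form found on the page, then verify
    # the session in a follow-up callback (hypothetical name).
    return FormRequest.from_response(
        response,
        formdata={'username': self.config['login']['user'],  # assumed keys
                  'password': self.config['login']['pass']},
        callback=self.post_login,
    )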
ZhihuSpider.py — project: zhihu_spider, author: dengqiangxi
def parse_followers(self, response):
        nametoken = response.meta['nametoken']
        api_followees_url = self.base_url + '/api/v4/members/' + response.url.split('/')[-2] + '/followees'
        api_followers_url = self.base_url + '/api/v4/members/' + response.url.split('/')[-2] + '/followers'

        yield scrapy.Request(url=api_followees_url, callback=self.parser_follow_json, headers=ZHIHU_HEADER,
                             cookies=ZHIHU_COOKIE, meta={
                'nametoken': nametoken
            })
        yield scrapy.Request(url=api_followers_url, callback=self.parser_follow_json, headers=ZHIHU_HEADER,
                             cookies=ZHIHU_COOKIE, meta={
                'nametoken': nametoken
            })




    # parse json
DouBan.py — project: Spider, author: iamyaojie
def parse(self, response):

        item = DoubanspiderItem()
        selector = Selector(response)
        Movies = selector.xpath('//div[@class="info"]')
        for eachMovie in Movies:
            title = eachMovie.xpath('div[@class="hd"]/a/span[@class="title"]/text()').extract()
            movieInfo = eachMovie.xpath('div[@class="bd"]/p/text()').extract()
            star = eachMovie.xpath('div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()').extract()
            quote = eachMovie.xpath('div[@class="bd"]/p[@class="quote"]/span/text()').extract()

            item['title'] = title
            item['movieInfo'] = ';'.join(movieInfo)
            item['star'] = star
            item['quote'] = quote
            # yield the item
            yield item
        nextLink = selector.xpath('//span[@class="next"]/link/@href').extract()
        if nextLink:
            nextLink = nextLink[0]
            print(nextLink)
            yield Request(self.url + nextLink, callback=self.parse)
proxy_spider.py — project: ProxyPool, author: Time1ess
def parse(self, response):
        for i in range(10):
            self.current += 1
            if self.current >= self.rule.max_page:
                break
            yield Request(self.rule.url_fmt.format(self.current))
        if response.status != 200:
            return None
        ip_list = response.xpath(self.rule.row_xpath)[1:]
        for ip_item in ip_list:
            l = ProxyItemLoader(item=ProxyItem(), selector=ip_item)
            l.add_xpath('proxy', self.rule.host_xpath)
            l.add_xpath('proxy', self.rule.port_xpath)
            l.add_xpath('ip', self.rule.host_xpath)
            l.add_xpath('port', self.rule.port_xpath)
            l.add_xpath('addr', self.rule.addr_xpath)
            l.add_xpath('mode', self.rule.mode_xpath)
            l.add_xpath('protocol', self.rule.proto_xpath)
            l.add_xpath('validation_time', self.rule.vt_xpath)
            l.add_value('src_rule', self.rule.name)
            yield l.load_item()
pornHubSpider.py — project: WebHubBot, author: xiyouMc
def parse_ph_key(self, response):
        selector = Selector(response)
        logging.debug('request url:------>' + response.url)
        # logging.info(selector)
        divs = selector.xpath('//div[@class="phimage"]')
        for div in divs:
            viewkey = re.findall('viewkey=(.*?)"', div.extract())
            # logging.debug(viewkey)
            yield Request(url='https://www.pornhub.com/embed/%s' % viewkey[0],
                          callback=self.parse_ph_info)
        url_next = selector.xpath(
            '//a[@class="orangeButton" and text()="Next "]/@href').extract()
        logging.debug(url_next)
        if url_next:
            # if self.test:
            logging.debug(' next page:---------->' + self.host + url_next[0])
            yield Request(url=self.host + url_next[0],
                          callback=self.parse_ph_key)
            # self.test = False
my_news_spider.py — project: Hanhan_NLP, author: hanhanwu
def parse_articles(self, response):
        article_ptn = r"http://www.theglobeandmail.com/opinion/(.*?)/article(\d+)/"
        resp_url = response.url
        article_m = re.match(article_ptn, resp_url)
        article_id = ''
        if article_m is not None:
            article_id = article_m.group(2)

        if article_id == '32753320':
            print('***URL***', resp_url)
            soup = BeautifulSoup(response.text, 'html.parser')
            text = Selector(text=response.text).xpath('//*[@id="content"]/div[1]/article/div/div[3]/div[2]').extract()


            if text:
                print("*****in Spider text*****", soup.title.string)
                yield {article_id: {"title": soup.title.string, "link": resp_url, "article_text": text}}
                comments_link = response.url + r'comments/'
                if comments_link == 'http://www.theglobeandmail.com/opinion/a-fascists-win-americas-moral-loss/article32753320/comments/':
                    yield Request(comments_link, callback=self.parse_comments)
neteasemusicspider.py — project: NetEaseMusicCrawler, author: yaochao
def parse_follows(self, response):
        ''' parse the follows '''
        url = response.url
        _id = url.split('=')[-1]
        item = response.meta['item']
        driver = response.meta['driver']
        try:
            driver.switch_to.default_content()
            g_iframe = driver.find_elements_by_tag_name('iframe')[0]
            driver.switch_to.frame(g_iframe)
            lis = driver.find_elements_by_xpath('//*[@id="main-box"]/li')
            follows = {}
            for li in lis:
                a = li.find_element_by_tag_name('a')
                title = a.get_attribute('title')
                href = a.get_attribute('href')
                uid = href.split('=')[-1]
                follows[uid] = title
            item['follows'] = follows
        except Exception as e:
            item['follows'] = None
            print e

        # driver.close()
        request = Request(url='http://music.163.com/user/fans?id=' + _id, callback=self.parse_fans)
        request.meta['item'] = copy.deepcopy(item)
        yield request

    # TODO
neteasemusicspider.py — project: NetEaseMusicCrawler, author: yaochao
def parse_fans(self, response):
        ''' parse the follows '''
        url = response.url
        _id = url.split('=')[-1]
        item = response.meta['item']
        driver = response.meta['driver']
        try:
            driver.switch_to.default_content()
            g_iframe = driver.find_elements_by_tag_name('iframe')[0]
            driver.switch_to.frame(g_iframe)
            lis = driver.find_elements_by_xpath('//*[@id="main-box"]/li')
            fans = {}
            for li in lis:
                a = li.find_element_by_tag_name('a')
                title = a.get_attribute('title')
                href = a.get_attribute('href')
                uid = href.split('=')[-1]
                fans[uid] = title
            item['fans'] = fans
        except Exception as e:
            item['fans'] = None
            print e

        # driver.close()
        request = Request(url='http://music.163.com/user/songs/rank?id=' + _id, callback=self.parse_songs_rank)
        request.meta['item'] = copy.deepcopy(item)
        yield request
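Both callbacks above deep-copy the item before attaching it to the next request's meta. The reason: one mutable item is threaded through a chain of callbacks, and without a copy every pending request would share (and mutate) the same dict. A minimal illustration of the aliasing pitfall:

import copy

item = {'follows': None, 'fans': None}
alias = item                       # what meta['item'] = item would share
independent = copy.deepcopy(item)  # what the snippets above do instead

alias['fans'] = {'uid1': 'name1'}
print(item['fans'])         # {'uid1': 'name1'} -- mutated through the alias
print(independent['fans'])  # None -- the deep copy is unaffected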
sobaidupan.py — project: sbdspider, author: onecer
def start_requests(self):
        for u in self.start_urls:
            yield Request(u, callback=self.parse, errback=self.errback)
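The errback is invoked with a Twisted Failure rather than a Response. A hedged sketch of what the errback method might look like; the logging choices are illustrative, not the project's actual code:

from scrapy.spidermiddlewares.httperror import HttpError
from twisted.internet.error import DNSLookupError, TimeoutError

def errback(self, failure):
    # failure.request is the Request that failed
    self.logger.error('Request failed: %s', failure.request.url)
    if failure.check(HttpError):
        # non-2xx responses land here unless explicitly allowed
        self.logger.error('HTTP error on %s', failure.value.response.url)
    elif failure.check(DNSLookupError, TimeoutError):
        self.logger.error('Network error: %s', failure.getErrorMessage())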
sobaidupan.py — project: sbdspider, author: onecer
def parse(self, response):
        yield self.parse_item(response)
        for a in response.css('a::attr(href)').extract():
            if not a:
                continue
            next_url = response.urljoin(a)
            yield Request(next_url, callback=self.parse)
queue.py — project: sbdspider, author: onecer
def pop(self, timeout=0):
        """Pop a request"""
        if timeout > 0:
            data = self.server.brpop(self.key, timeout=timeout)
            if isinstance(data, tuple):
                data = data[1]
        else:
            data = self.server.rpop(self.key)
        if data:
            cb, url = data.split('--', 1)
            try:
                cb = getattr(self.spider, str(cb))
                return Request(url=url, callback=cb)
            except AttributeError:
                raise ValueError("Method %r not found in: %s" % (cb, self.spider))
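pop() above assumes each Redis entry is a 'callback_name--url' string. A matching push() sketch under that same assumed wire format (lpush pairs with the rpop/brpop reads above to give FIFO order):

def push(self, request):
    """Store a request as 'callback_name--url' (assumed format)."""
    cb = request.callback.__name__ if request.callback else 'parse'
    self.server.lpush(self.key, '%s--%s' % (cb, request.url))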
Acfun_article.py — project: Acfun_article_spider, author: bbbbx
def parse(self, response):
        article_nodes = response.css('#block-content-article .mainer .item a.title')

        for article_node in article_nodes:
            article_url = urlparse.urljoin(response.url, str(article_node.css("::attr(href)").extract_first(
                "")))  # "http://www.acfun.cn" + str(article_node.css("::attr(href)").extract_first(""))
            yield Request(url=article_url, callback=self.parse_detail, dont_filter=True)

        next_nodes = response.css(".pager")
        next_node = next_nodes[len(next_nodes) - 1]
        next_url = str(next_node.css("::attr(href)").extract_first(""))
        if next_url:
            next_url = urlparse.urljoin(response.url, next_url)
            yield Request(url=next_url, callback=self.parse, dont_filter=True)
peuland.py — project: rental, author: meihuanyu
def start_requests(self):
        for i, url in enumerate(self.urls):
            yield Request(
                    url = url,
                    headers = self.headers,
                    meta = self.meta,
                    dont_filter = True,
                    callback = self.parse_page,
                    errback = self.error_parse,
            )

