python类Request()的实例源码-第2页-面圈网

SolutionSpider.py 文件源码项目：codeforces-crawler 作者: Nymphet 项目源码文件源码阅读 23 收藏 0 点赞 0 评论 0

def parse(self, response):
        for solution_href in response.selector.xpath('//a[@title="Participants solved the problem"]/@href'):
            solution_url = response.urljoin(
                solution_href.extract() + '?order=BY_CONSUMED_TIME_ASC')
            yield scrapy.Request(solution_url, callback=self.parse_problem_solution_list_page)

        if response.selector.xpath('//span[@class="inactive"]/text()').extract():
            if response.selector.xpath('//span[@class="inactive"]/text()')[0].extract() != u'\u2192':
                next_page_href = response.selector.xpath(
                    '//div[@class="pagination"]/ul/li/a[@class="arrow"]/@href')[0]
                next_page_url = response.urljoin(next_page_href.extract())
                yield scrapy.Request(next_page_url, callback=self.parse)
        else:
            next_page_href = response.selector.xpath(
                '//div[@class="pagination"]/ul/li/a[@class="arrow"]/@href')[1]
            next_page_url = response.urljoin(next_page_href.extract())
            yield scrapy.Request(next_page_url, callback=self.parse)

TestDataSpider.py 文件源码项目：codeforces-crawler 作者: Nymphet 项目源码文件源码阅读 21 收藏 0 点赞 0 评论 0

def parse(self, response):
        for solution_href in response.selector.xpath('//a[@title="Participants solved the problem"]/@href'):
            solution_url = response.urljoin(
                solution_href.extract() + '?order=BY_CONSUMED_TIME_ASC')
            yield scrapy.Request(solution_url, callback=self.parse_problem_solution_list_page)

        if response.selector.xpath('//span[@class="inactive"]/text()').extract():
            if response.selector.xpath('//span[@class="inactive"]/text()')[0].extract() != u'\u2192':
                next_page_href = response.selector.xpath(
                    '//div[@class="pagination"]/ul/li/a[@class="arrow"]/@href')[0]
                next_page_url = response.urljoin(next_page_href.extract())
                yield scrapy.Request(next_page_url, callback=self.parse)
        else:
            next_page_href = response.selector.xpath(
                '//div[@class="pagination"]/ul/li/a[@class="arrow"]/@href')[1]
            next_page_url = response.urljoin(next_page_href.extract())
            yield scrapy.Request(next_page_url, callback=self.parse)

diputados_scrapy.py 文件源码项目：scraping-python 作者: python-madrid 项目源码文件源码阅读 20 收藏 0 点赞 0 评论 0

def parse_lista_diputados(self, response):
        # listado de diputados
        diputados = response.xpath(
            '//div[@class="listado_1"]/ul/li/a/@href').extract()
        for diputado in diputados:
            request = scrapy.Request(
                response.urljoin(diputado),
                callback=self.parse_diputado)
            yield request

        # proxima pagina
        pagina_siguiente = response.xpath(
            '//a[contains(., "Página Siguiente")]/@href').extract_first()
        if pagina_siguiente:
            request = scrapy.Request(
                pagina_siguiente,
                callback=self.parse_lista_diputados)
            yield request

zhihu.py 文件源码项目：spider 作者: pythonsite 项目源码文件源码阅读 25 收藏 0 点赞 0 评论 0

def parse_user(self, response):
        '''
        ??????json??????????????json.loads????
        :param response:
        :return:
        '''
        result = json.loads(response.text)
        item = UserItem()
        #?????????????????????????????
        for field in item.fields:
            if field in result.keys():
                item[field] = result.get(field)

        #?????item?????Request??????????????????????????
        yield item
        yield Request(self.follows_url.format(user = result.get("url_token"),include=self.follows_query,offset=0,limit=20),callback=self.parse_follows)
        yield Request(self.followers_url.format(user = result.get("url_token"),include=self.followers_query,offset=0,limit=20),callback=self.parse_followers)

zhihu.py 文件源码项目：spider 作者: pythonsite 项目源码文件源码阅读 31 收藏 0 点赞 0 评论 0

def parse_follows(self, response):
        '''
        ?????????????????json?? ???????data?page???page?????
        :param response:
        :return:
        '''
        results = json.loads(response.text)

        if 'data' in results.keys():
            for result in results.get('data'):
                yield Request(self.user_url.format(user = result.get("url_token"),include=self.user_query),callback=self.parse_user)

        #????page????????page????is_end?????False????False?????????????????
        if 'page' in results.keys() and results.get('is_end') == False:
            next_page = results.get('paging').get("next")
            #????????????yield????Request???????????????????
            yield Request(next_page,self.parse_follows)

zhihu.py 文件源码项目：spider 作者: pythonsite 项目源码文件源码阅读 32 收藏 0 点赞 0 评论 0

def parse_followers(self, response):
        '''
        ??????????????????
        ?????????????????json?? ???????data?page???page?????
        :param response:
        :return:
        '''
        results = json.loads(response.text)

        if 'data' in results.keys():
            for result in results.get('data'):
                yield Request(self.user_url.format(user = result.get("url_token"),include=self.user_query),callback=self.parse_user)

        #????page????????page????is_end?????False????False?????????????????
        if 'page' in results.keys() and results.get('is_end') == False:
            next_page = results.get('paging').get("next")
            #????????????yield????Request???????????????????
            yield Request(next_page,self.parse_followers)

zhihu.py 文件源码项目：ArticleSpider 作者: mtianyan 项目源码文件源码阅读 28 收藏 0 点赞 0 评论 0

def parse(self, response):
        """
                ???html??????url ?????url??????
                ?????url???? /question/xxx ?????????????
                """
        all_urls = response.css("a::attr(href)").extract()
        all_urls = [parse.urljoin(response.url, url) for url in all_urls]
        # ??lambda???????url????????true???????false???
        all_urls = filter(lambda x: True if x.startswith("https") else False, all_urls)
        for url in all_urls:
            match_obj = re.match("(.*zhihu.com/question/(\d+))(/|$).*", url)
            if match_obj:
                # ?????question???????????????????
                request_url = match_obj.group(1)
                yield scrapy.Request(request_url, headers=self.headers, callback=self.parse_question)
                #??
                # break
            else:
                # pass
                # ????question??????????
                yield scrapy.Request(url, headers=self.headers, callback=self.parse)

zhihu.py 文件源码项目：ArticleSpider 作者: mtianyan 项目源码文件源码阅读 31 收藏 0 点赞 0 评论 0

def start_requests(self):
        return [scrapy.Request('https://www.zhihu.com/#signin', headers=self.headers, callback=self.login)]

zhihu.py 文件源码项目：ArticleSpider 作者: mtianyan 项目源码文件源码阅读 29 收藏 0 点赞 0 评论 0

def login(self, response):
        response_text = response.text
        match_obj = re.match('.*name="_xsrf" value="(.*?)"', response_text, re.DOTALL)
        xsrf = ''
        if match_obj:
            xsrf = (match_obj.group(1))

        if xsrf:
            post_url = "https://www.zhihu.com/login/phone_num"
            post_data = {
                "_xsrf": xsrf,
                "phone_num": "18487255487",
                "password": "ty158917",
                "captcha": ""
            }

            import time
            t = str(int(time.time() * 1000))
            captcha_url = "https://www.zhihu.com/captcha.gif?r={0}&type=login".format(t)
            yield scrapy.Request(captcha_url, headers=self.headers, meta={"post_data":post_data}, callback=self.login_after_captcha)

BookSpider.py 文件源码项目：crawl_web 作者: hanxlinsist 项目源码文件源码阅读 24 收藏 0 点赞 0 评论 0

def parse(self, response):
        links = response.xpath("//*[@class = 'tagCol']/descendant::a/@href").extract()
        for href in links:
            for pageNum in np.linspace(0, 180, 10): # ????Tag??10???
                full_url = response.urljoin(href + "/?start=" + str(int(pageNum)) + "&type=S") # ?type=S  ????????
                yield scrapy.Request(full_url, callback=self.parse_tag_per_page)

    # ???????????????

BookSpider.py 文件源码项目：crawl_web 作者: hanxlinsist 项目源码文件源码阅读 25 收藏 0 点赞 0 评论 0

def parse_tag_per_page(self, response):
        links = response.xpath("//ul[@class = 'subject-list']/descendant::a[@class = 'nbg']/@href").extract()
        for book in links:
            yield scrapy.Request(book, callback=self.parse_book)

    # ???????????????BookItem

suning.py 文件源码项目：lichking 作者: melonrun 项目源码文件源码阅读 86 收藏 0 点赞 0 评论 0

def start_requests(self):
        for part_url in self.start_urls:
            yield scrapy.Request(
                part_url,
                meta={"page_key": 0},
                callback=self.generate_productlist
            )

    # ?? ??????????100???

suning.py 文件源码项目：lichking 作者: melonrun 项目源码文件源码阅读 25 收藏 0 点赞 0 评论 0

def generate_product_detail(self, response):
        product_id1 = re.search('/([\d]+)/', response.url).group(1)
        product_id2 = re.search('/([\d]+).html', response.url).group(1)
        category = generate_product_category(response)
        yield scrapy.Request(
            'http://review.suning.com/ajax/review_lists/general-000000000' + product_id2 +
                '-' + product_id1 + '-total-1-default-10-----reviewList.htm',
            callback=self.generate_product_comment,
            meta={"page_key": 1, "category": category, "url": response.url}
        )

suning.py 文件源码项目：lichking 作者: melonrun 项目源码文件源码阅读 26 收藏 0 点赞 0 评论 0

def generate_comment_usefulcnt(self, response):
        review_userful = json.loads(re.search('usefulCnt\((.*)\)', response.body).group(1))
        if 'reviewUsefuAndReplylList' in review_userful:
            useful_dict = review_userful.get('reviewUsefuAndReplylList')
            suning_item = YSuningItem()
            c_id = str(useful_dict[0].get('commodityReviewId'))
            suning_item._id = c_id
            suning_item.useful_vote_count = str(useful_dict[0].get('usefulCount'))
            suning_item.replies = str(useful_dict[0].get('replyCount'))
            if useful_dict[0].get('replyCount') > 0:
                yield scrapy.Request(
                    'https://review.suning.com/ajax/reply_list/' + c_id + '--1-replylist.htm',
                    callback=self.generate_comment_replylist
                )
            MongoClient.save_suning_usefulcnt(suning_item, YSuningItem)

shayu_spider.py 文件源码项目：lichking 作者: melonrun 项目源码文件源码阅读 28 收藏 0 点赞 0 评论 0

def generate_forum_list(self, response):
        forum_list = response.xpath('//a/@href').extract()
        if len(forum_list) > 0:
            for forum_url in forum_list:
                url = re.search(u'http://www.18095.com/forum-\d{1,10}-1.html', forum_url)
                if url is not None:
                    yield scrapy.Request(
                        forum_url,
                        meta={"page_key": 1},
                        callback=self.generate_forum_list
                    )

        page_key = int(response.meta['page_key'])
        rep_time_list = response.xpath('//tr/td[@class="by"]/em/a').extract()
        if len(response.xpath('//span[@id="fd_page_bottom"]//a[@class="nxt"]/@href').extract()) != 0:
            if page_key == 1 or self.check_rep_date(rep_time_list):
                nxt_page = \
                    response.xpath('//span[@id="fd_page_bottom"]//a[@class="nxt"]/@href').extract()[0]
                yield scrapy.Request(
                    nxt_page,
                    meta={"page_key": -1},
                    callback=self.generate_forum_list
                )

                thread_list = response.xpath('//a[contains(@class,"xst")]/@href').extract()
                if len(thread_list) > 0:
                    logging.error(len(thread_list))
                    for thread_url in thread_list:
                        yield scrapy.Request(
                            thread_url,
                            callback=self.generate_forum_thread
                        )

gfan.py 文件源码项目：lichking 作者: melonrun 项目源码文件源码阅读 24 收藏 0 点赞 0 评论 0

def start_requests(self):
        # get into the bbs
        yield scrapy.Request(
            self.start_urls[0],
            meta={"page_key": 1},
            callback=self.generate_forum_list
        )
        # yield scrapy.Request(
        #     'http://bbs.gfan.com/forum-1686-1.html',
        #     callback=self.generate_forum_page_list
        # )

gfan.py 文件源码项目：lichking 作者: melonrun 项目源码文件源码阅读 20 收藏 0 点赞 0 评论 0

def generate_forum_list(self, response):
        forum_list = re.findall(u'http://bbs.gfan.com/forum-[\d]+-1.html', response.body)
        if len(forum_list) > 0:
            for forum_url in forum_list:
                if forum_url not in self.forum_dict:
                    yield scrapy.Request(
                        forum_url,
                        meta={"page_key": 1},
                        callback=self.generate_forum_list
                    )

        pg_bar = response.xpath('//div[@class="pg"]//a[@class="nxt"]/@href').extract()
        rep_time_list = response.xpath('//tr/td[@class="by"]/em/a').extract()
        page_key = int(response.meta['page_key'])
        if len(pg_bar) > 0:
            if page_key == 1 or self.check_rep_date(rep_time_list):
                yield scrapy.Request(
                    pg_bar[0],
                    meta={"page_key": -1},
                    callback=self.generate_forum_list
                )

            thread_list = response.xpath('//a[@class="xst"]/@href').extract()
            logging.error(len(thread_list))
            if len(thread_list) > 0:
                for thread_url in thread_list:
                    yield scrapy.Request(
                        thread_url,
                        callback=self.generate_forum_thread
                    )

angeeks.py 文件源码项目：lichking 作者: melonrun 项目源码文件源码阅读 28 收藏 0 点赞 0 评论 0

def generate_forum(self, response):
        forum_list = response.xpath('//td[@class="fl_g"]//dl//dt//a/@href').extract()
        if len(forum_list) > 0:
            for forum_url in forum_list:
                f_url = forum_url
                yield scrapy.Request(
                    f_url,
                    meta={"page_key": 1},
                    callback=self.generate_forum
                )

        # check ??????
        rep_time_list = response.xpath('//tr/td[@class="by"]/em/a').extract()
        page_key = int(response.meta['page_key'])
        pg_bar = response.xpath('//div[@class="pg"]//a[@class="nxt"]/@href').extract()
        if page_key == 1 or self.check_rep_date(rep_time_list):
            if len(pg_bar) > 0:
                yield scrapy.Request(
                    pg_bar[0],
                    meta={"page_key": -1},
                    callback=self.generate_forum
                )

            # scrapy all tie url
            thread_list = response.xpath('//a[contains(@class,"xst")]/@href').extract()
            logging.error(len(thread_list))
            if len(thread_list) > 0:
                for thread_url in thread_list:
                    yield scrapy.Request(
                        thread_url,
                        callback=self.generate_forum_thread
                    )

zhiyoo.py 文件源码项目：lichking 作者: melonrun 项目源码文件源码阅读 24 收藏 0 点赞 0 评论 0

def generate_forum(self, response):
        forum_list = response.xpath('//td[@class="fl_g"]//dl//dt//a/@href').extract()
        if len(forum_list) > 0:
            for forum_url in forum_list:
                f_url = forum_url
                if forum_url.find('bbs.zhiyoo.com') == -1:
                    f_url = 'http://bbs.zhiyoo.com/' + forum_url

                yield scrapy.Request(
                    f_url,
                    meta={"page_key": 1, "proxy": MongoClient.get_random_proxy()},
                    callback=self.generate_forum
                )

        # check ??????
        pg_bar = response.xpath('//div[@class="pg"]//a[@class="nxt"]/@href').extract()
        page_key = int(response.meta['page_key'])
        rep_time_list = response.xpath('//tr/td[@class="by"]/em/a').extract()
        # ???????????
        if len(pg_bar) > 0:
            if page_key == 1 or self.check_rep_date(rep_time_list):
                yield scrapy.Request(
                    pg_bar[0],
                    meta={"page_key": -1, "proxy": MongoClient.get_random_proxy()},
                    callback=self.generate_forum
                )
                # scrapy all tie url
                thread_list = response.xpath('//a[@class="xst"]/@href').extract()
                if len(thread_list) > 0:
                    for thread_url in thread_list:
                        yield scrapy.Request(
                            thread_url,
                            meta={"proxy": MongoClient.get_random_proxy()},
                            callback=self.generate_forum_thread
                        )

tieba.py 文件源码项目：lichking 作者: melonrun 项目源码文件源码阅读 27 收藏 0 点赞 0 评论 0

def get_record_list(self, response):
        content = response.body
        content = content.replace('<!--', '')
        content = content.replace('-->', '')
        tree = etree.HTML(content)
        url_list = tree.xpath('//*[@id="thread_list"]//a/@href')
        category = response.meta['category']
        for i in url_list:
            if '/p/' in i and 'http://' not in i:
                tie_url = 'http://tieba.baidu.com' + i
                yield scrapy.Request(
                    tie_url,
                    meta={"category": category},
                    callback=self.get_record_page_num
                )
        # check last reply time, ???????????? 12:12
        rep_time = tree.xpath('//span[contains(@class,"threadlist_reply_date")]/text()')
        if self.check_rep_date(rep_time[0]):
            next_page = tree.xpath('//a[contains(@class, "next")]/text()')
            if len(next_page) > 0:
                logging.error(next_page[0])
                page_key = int(response.meta['page_key']) + 50
                url = 'http://tieba.baidu.com/f?ie=utf-8&kw=' + category + '&fr=search&pn=' + str(page_key)
                yield scrapy.Request(
                    url,
                    meta={"page_key": page_key, "category": category},
                    callback=self.get_record_list
                )