Example source code for the Python Spider() class

index_spider.py (project: wechat-crawler, author: DMGbupt)
def parse_search(self, response):
        """
        @summary: ?????????????request???????
        @param response:start_requests()?????????????
        """
        # ???????????????????????"antispider"??
        # ????"antispider"???????????????????????????
        if "antispider" in response.url:
            spider_logger.error("Closing spider for verification code received in %s ! Spider will restart automatically after 12 hours!" % response.url)
            time.sleep(43200) # wait 12 hours before closing the spider
            raise CloseSpider('antispider')
        # ext holds the href of the first result entry; it is used below to build the JSON index URL
        ext = response.xpath(
            '//div[@class="wx-rb bg-blue wx-rb_v1 _item"][1]/@href').extract() # href of the first matched result; empty if the search returned nothing
        if not ext:
            spider_logger.error("Faild searching {0} !".format(response.meta['query']))
            return
        # Build the JSON index URL; results come back 10 per page, starting from the first page (page=1)
        json_url = "".join(ext).replace('/gzh?','http://weixin.sogou.com/gzhjs?')+'&cb=sogou.weixin_gzhcb&page=1&gzhArtKeyWord='
        cookies = response.meta['cookies']
        yield Request(json_url, callback= self.parse_index, cookies=cookies, meta ={'cookies':cookies})
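For illustration, a minimal sketch of the URL rewrite performed above; the href value is made up, only the string handling mirrors the code:

# a hypothetical href extracted from the first search result
ext = ['/gzh?openid=oIWsFtXXXXXXXX&ext=abcd']
json_url = "".join(ext).replace('/gzh?', 'http://weixin.sogou.com/gzhjs?') + '&cb=sogou.weixin_gzhcb&page=1&gzhArtKeyWord='
# json_url is now:
# http://weixin.sogou.com/gzhjs?openid=oIWsFtXXXXXXXX&ext=abcd&cb=sogou.weixin_gzhcb&page=1&gzhArtKeyWord=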
test_exporter.py (project: scrapy_rss, author: woxcab)
def __init__(self, feed_file=None, feed_title=None, feed_link=None, feed_description=None,
                 crawler_settings=None):
        settings = crawler_settings if crawler_settings else dict(self.default_settings)
        if feed_file:
            settings['FEED_FILE'] = feed_file
        if feed_title:
            settings['FEED_TITLE'] = feed_title
        if feed_link:
            settings['FEED_LINK'] = feed_link
        if feed_description:
            settings['FEED_DESCRIPTION'] = feed_description
        self.crawler = get_crawler(settings_dict=settings)
        self.spider = scrapy.Spider.from_crawler(self.crawler, 'example.com')
        self.spider.parse = lambda response: ()
        item_processor = settings.get('ITEM_PROCESSOR')
        if not item_processor:
            item_processor = RaisedItemPipelineManager
        elif isinstance(item_processor, six.string_types):
            item_processor = load_object(item_processor)

        self.ipm = item_processor.from_crawler(self.crawler)
Tieba_search.py (project: Crawlers, author: mi-minus)
def __init__(self, *args, **kwargs):
        super(TiebaSearchSpider, self).__init__(*args, **kwargs)
        self.dig_pattern = re.compile('(\d+)')
        self.postid_pattern = re.compile('/p/(\d{10})')
        self.page_all=1
        self.site_id=2
        self.site_name = u'tieba_search'
        self.Flag_List = []
        self.Maxpage_List = []
        self.MAX_PAGE_NUM = 5
        self.headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0',
                'Host': 'www.baidu.com',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Encoding': 'gzip, deflate, br',
                'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        }
B_search_spider.py (project: Crawlers, author: mi-minus)
def __init__(self, *args, **kwargs):
        super(DmozSpider_search, self).__init__(*args, **kwargs)
        # self.sqldb = SqliteTime(self.name)
        self.dig_pattern = re.compile('(\d+)')
        self.postid_pattern = re.compile('/p/(\d{10})')
        self.page_all=1
        self.site_id = 500
        self.Flag_List = []
        self.Maxpage_List = [] 
        self.MAX_PAGE_NUM = 5        
        self.headers={
                      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                      'Accept-Language' : 'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3',
                      'Connection' : 'keep-alive',
                      'DNT' : '1',
                      'Host' : 'www.baidu.com',
                      'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0',
                      }
        self.content_pa1=re.compile('</div>(.*?)<br',re.S)
tests.py (project: sbdspider, author: onecer)
def setUp(self):
        self.spider = Spider('myspider')
        self.key = 'scrapy_redis:tests:%s:queue' % self.spider.name
        self.q = self.queue_cls(self.server, Spider('myspider'), self.key)
tests.py (project: sbdspider, author: onecer)
def setUp(self):
        self.persist = False
        self.key_prefix = 'scrapy_redis:tests:'
        self.queue_key = self.key_prefix + '%(spider)s:requests'
        self.dupefilter_key = self.key_prefix + '%(spider)s:dupefilter'
        self.idle_before_close = 0
        self.scheduler = Scheduler(self.server, self.persist, self.queue_key,
                                   SpiderQueue, self.dupefilter_key,
                                   self.idle_before_close)
        self.spider = Spider('myspider')
index_spider.py (project: wechat-crawler, author: DMGbupt)
def __init__(self,
                 query=None,
                 start_time=None,
                 end_time=None,
                 index_pages=None):
        """
        @summary: ?????????, ?????????????
        @param query: ???,???????
        @param start_time: ????????start_time???????????????????????
        @param end_time: ????????end_time?????
        @param index_pages: ?????????????
        """
        # ??????????????????????????
        if query:
            self.query = query # self.query????????????
        else:
            # ???????????????????????
            spider_logger.error("Spider need single search word each time!Check input!")
            raise CloseSpider('invaild search word')
        # ???????????????100??
        if start_time:
            self.from_time = start_time
        else:
            self.from_time = datetime.now()-timedelta(days=100)  # default: 100 days ago
        # end of the time window; defaults to now
        if end_time:
            self.end_time = end_time
        else:
            self.end_time = datetime.now()  # default: now
        # number of index pages to crawl
        if index_pages:
            self.index_pages = int(index_pages)
        else:
            self.index_pages = 10 # default: 10 pages
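A minimal sketch of launching a spider with these constructor arguments from a script; the import path and the class name WeixinIndexSpider are assumptions, only the keyword arguments match the __init__ above:

from datetime import datetime, timedelta

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# hypothetical import path for the spider defined in index_spider.py
from wechat_crawler.spiders.index_spider import WeixinIndexSpider

process = CrawlerProcess(get_project_settings())
process.crawl(WeixinIndexSpider,
              query='scrapy',                                   # required search keyword
              start_time=datetime.now() - timedelta(days=30),   # optional, defaults to 100 days ago
              index_pages=5)                                    # optional, defaults to 10
process.start()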
index_spider.py (project: wechat-crawler, author: DMGbupt)
def parse_index(self, response):
        """
        @summary: ?????????????????Request??
        @param response: parse_search()?????????????
        @return: list????????????url???????????
        """
        if "antispider" in response.url:
            spider_logger.error("Closing spider for verification code received in %s ! Spider will restart automatically after 12 hours!" % response.url)
            time.sleep(43200)
            raise CloseSpider('antispider')
        requests = []
        page_list = self._get_result(response)
        # stop if no results were parsed from this page
        if not page_list:
            return requests
        next_page = True  # whether to request the next index page
        # walk through the parsed results
        for item in page_list:
            if isinstance(item, Request):  # _get_result() may hand back a Request directly
                requests.append(item)
                next_page = False
                break
            if item['publish_time'] <= self.from_time:  # published before self.from_time: stop paging
                next_page = False
                break
            elif item['publish_time'] > self.end_time:  # published after self.end_time: skip
                continue
            else:
                req = Request(item['url'], self.parse_page)
                # pass the parsed item along with the request
                req.meta["item"] = item
                requests.append(req)
        # if there are more results, request the next index page; otherwise stop here
        if next_page and self._next_result_page(response):
            cookies = response.meta['cookies']
            requests.append(Request(self._next_result_page(response), callback=self.parse_index, cookies=cookies, meta={'cookies': cookies}))
        return requests
index_spider.py (project: wechat-crawler, author: DMGbupt)
def parse_page(self, response):
        """
        @summary: ??????
        @param response: parse_index()?????????????
        @return: ?????_finish_item()??????
        """
        if "antispider" in response.url:
            spider_logger.error("Closing spider for verification code received in %s ! Spider will restart automatically after 12 hours!" % response.url)
            time.sleep(43200)
            raise CloseSpider('antispider')
        item = response.meta["item"]
        return self._finish_item(item, response)
tuCarroWholeSiteScrapper.py (project: used-cars-Colombia, author: john-guerra)
def __init__(self):
      scrapy.Spider.__init__(self)
      self.download_delay = 1/float(self.rate)
tests.py (project: scrappy, author: DormyMo)
def setUp(self):
        self.spider = Spider('myspider')
        self.key = 'scrapy_redis:tests:%s:queue' % self.spider.name
        self.q = self.queue_cls(self.server, Spider('myspider'), self.key)
tests.py (project: scrappy, author: DormyMo)
def setUp(self):
        self.persist = False
        self.key_prefix = 'scrapy_redis:tests:'
        self.queue_key = self.key_prefix + '%(spider)s:requests'
        self.dupefilter_key = self.key_prefix + '%(spider)s:dupefilter'
        self.idle_before_close = 0
        self.scheduler = Scheduler(self.server, self.persist, self.queue_key,
                                   SpiderQueue, self.dupefilter_key,
                                   self.idle_before_close)
        self.spider = Spider('myspider')
test_media_pipeline.py (project: scrapy-cdr, author: TeamHG-Memex)
def __init__(self, url):
        super(Spider, self).__init__()
        self.start_urls = [url]
        self.le = LinkExtractor(canonicalize=False)
        self.files_le = LinkExtractor(
            tags=['a'], attrs=['href'], deny_extensions=[], canonicalize=False)
test_media_pipeline.py (project: scrapy-cdr, author: TeamHG-Memex)
def make_crawler(**extra_settings):
    settings = Settings()
    settings['ITEM_PIPELINES'] = {
        'scrapy_cdr.media_pipeline.CDRMediaPipeline': 1,
        'tests.utils.CollectorPipeline': 100,
    }
    settings.update(extra_settings)
    runner = CrawlerRunner(settings)
    return runner.create_crawler(Spider)
music_spider.py (project: Music-Scraper, author: srivatsan-ramesh)
def parse(self, response):
        """
        Override function of the class scrapy.Spider. Called when response is obtained
        :param response: Response object used to get the details of the webpage
        """
        for href in response.xpath("//a/@href").extract():
            # Iterating over all the urls in the google search page
            if href[:7] == '/url?q=' and is_standard_website(href):
                # Getting the search results alone
                url = href[7:].split('&')[0]
                # starting another request for each search result url
                yield scrapy.Request(url, meta={'download_maxsize': 2097152}, callback=self.parse_result_contents)
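A quick worked example of the href handling above; the href string is made up but has the shape of a Google result link:

href = '/url?q=https://www.example.com/lyrics/song-123&sa=U&ved=0ahUKEw'  # hypothetical value
if href[:7] == '/url?q=':
    url = href[7:].split('&')[0]
    print(url)  # https://www.example.com/lyrics/song-123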
middlewares.py (project: decoration-design-crawler, author: imflyn)
def process_response(self, request, response, spider: Spider):
        if response.status == 302 or response.status == 503:
            self.ERROR_COUNT += 1
            print('error count: %s' % self.ERROR_COUNT)
            if self.ERROR_COUNT > 100:
                spider.close(spider, 'http status error')
        return response
middlewares.py (project: decoration-design-crawler, author: imflyn)
def process_response(self, request, response, spider: Spider):
        if response.status == 302 or response.status == 503:
            self.ERROR_COUNT += 1
            print('error count: %s' % self.ERROR_COUNT)
            if self.ERROR_COUNT > 100:
                spider.close(spider, 'http status error')
        return response
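For context, a downloader middleware like the one above only runs if it is enabled in the project settings; a minimal sketch, where the module path and class name are assumptions:

# settings.py (sketch)
DOWNLOADER_MIDDLEWARES = {
    'decoration_design_crawler.middlewares.HttpErrorCountMiddleware': 543,  # hypothetical path
}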
assessment_spider.py (project: assessor-scraper, author: codefornola)
def parse(self, response):
        """
        Default callback function with response for the crawled url
        https://doc.scrapy.org/en/latest/topics/spiders.html#scrapy.spiders.Spider.parse
        """
        response = response.replace(body=re.sub(r"<br\s*[\/]?>", "\n", response.body.decode('utf-8')))
        property_key = response.url.split('=')[1].replace('&', '')
        # logging.debug("Parsing property_key: %s", property_key)
        property_info = self.parse_property_info(response)
        property_values = self.parse_property_values(response)
        property_sales = self.parse_property_sales(response)
        property_info['property_key'] = property_key
        property_info['sales'] = property_sales
        property_info['values'] = property_values
        yield Property(property_info)
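A small worked example of the <br> normalisation performed at the top of parse(); the body bytes are made up:

import re

body = b'1234 MAIN ST<br/>NEW ORLEANS, LA 70115'  # hypothetical response body fragment
text = re.sub(r"<br\s*[\/]?>", "\n", body.decode('utf-8'))
print(text)
# 1234 MAIN ST
# NEW ORLEANS, LA 70115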
basic_authentication_spider.py (project: Hanhan_NLP, author: hanhanwu)
def main():
    test_spider = MySpider(scrapy.Spider)
    test_spider.start_requests()
base.py (project: rojak, author: pyk)
def spider_closed(self, spider, reason):
        spider.logger.info('Spider closed: %s %s', spider.name, reason)
        # if spider finished without error update last_scraped_at
        if reason == 'finished':
            try:
                self.logger.info('Updating media last_scraped_at information')
                self.cursor.execute(sql_update_media, [spider.name])
                self.db.commit()
                self.db.close()
            except mysql.Error as err:
                self.logger.error('Unable to update last_scraped_at: %s', err)
                self.db.rollback()
                self.db.close()
                if self.is_slack:
                    error_msg = '{}: Unable to update last_scraped_at: {}'.format(
                        spider.name, err)
                    self.slack.chat.post_message('#rojak-pantau-errors', error_msg,
                        as_user=True)
        else:
            if self.is_slack:
                # Send error to slack
                error_msg = '{}: Spider fail because: {}'.format(
                    spider.name, reason)
                self.slack.chat.post_message('#rojak-pantau-errors',
                        error_msg, as_user=True)

    # subscribe to the item_dropped event
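A handler like spider_closed above is typically wired to the spider_closed signal when the component is built from the crawler; a minimal method sketch, assuming this handler lives in a Scrapy extension or pipeline class:

from scrapy import signals

@classmethod
def from_crawler(cls, crawler):
    # build the component and connect its handler to the spider_closed signal
    ext = cls()
    crawler.signals.connect(ext.spider_closed, signal=signals.spider_closed)
    return ext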
baidu_weibo.py (project: Crawlers, author: mi-minus)
def __init__(self,*args,**kwargs):
        super(scrapy.Spider,self).__init__(*args,**kwargs)
        self.Flag_List = []
        self.Maxpage_List = []
        self.MAX_PAGE_NUM = 76
        self.site_id = 1  # id of the source site
        self.site_name = u'baidu_weibo'
        self.base_url = 'https://www.baidu.com/s?wd=%s&pn=0&cl=2&tn=baiduwb&ie=utf-8&f=3&rtt=2'
        self.topic_kws = None
        self.pa = re.compile('&pn=(\d+)&') 
        self.pa_time = re.compile('\d+')
        self.headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0',
                'Host': 'www.baidu.com',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Encoding': 'gzip, deflate, br',
                'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        }

        self.headers_weibo = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0',
                'Host': 'weibo.com',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Encoding': 'gzip, deflate',
                'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
                'Connection': 'keep-alive',
                'Upgrade-Insecure-Requests': '1'
        }
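For illustration, the base_url template above is filled with a search keyword before the first request is made; a small sketch with a made-up keyword:

base_url = 'https://www.baidu.com/s?wd=%s&pn=0&cl=2&tn=baiduwb&ie=utf-8&f=3&rtt=2'
start_url = base_url % 'python'  # illustrative keyword (self.topic_kws presumably holds the real one)
# https://www.baidu.com/s?wd=python&pn=0&cl=2&tn=baiduwb&ie=utf-8&f=3&rtt=2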
asin_spider.py (project: amazon-scrapy, author: dynamohuang)
def __init__(self):
        scrapy.Spider.__init__(self)
        pydispatch.dispatcher.connect(self.handle_spider_closed, signals.spider_closed)
        # all ASINs scraped will be stored in this list
        self.asin_pool = []
detail_spider.py (project: amazon-scrapy, author: dynamohuang)
def __init__(self):
        scrapy.Spider.__init__(self)
        pydispatch.dispatcher.connect(self.handle_spider_closed, signals.spider_closed)
        # scraped products will be stored in the structures below
        self.product_pool = {}
        self.log = []
        self.products = []
aiqiyi_spider.py (project: video_url_crawler_demo, author: czs0x55aa)
def __init__(self):
        scrapy.spiders.Spider.__init__(self)

        self.global_settings = get_project_settings()
        if self.global_settings['PLATFORM'] in ['win', 'mac']:
            self.driver = webdriver.PhantomJS(executable_path= self.global_settings['PHANTOMJS_PATH'])
        elif self.global_settings['PLATFORM'] in ['linux']:
            self.driver = webdriver.PhantomJS()
        self.driver.set_page_load_timeout(30)
        self.driver.implicitly_wait(10)

        self.type_id_list = self.global_settings['CRAWLER']['type_id_list']
        self.re_type_id = re.compile(self.global_settings['CRAWLER']['re_type_id'])
        self.url_template = self.global_settings['CRAWLER']['url_template']
aiqiyi_spider.py (project: video_url_crawler_demo, author: czs0x55aa)
def __del__(self):
        self.driver.quit()
        scrapy.spiders.Spider.__del__(self)
practise_spider.py (project: Octojobs, author: OctoJobs)
def start_requests(self):
        """Makes the initial request to the page you want to scrape.
        Returns an iterable of Requests, which the Spider can crawl.
        More requests will be generated successively from initial requests."""
        urls = [
            'https://www.dice.com/jobs/detail/Etl%26%2347Informatica-Production-Support-%26%2347Developer-Pyramid-Consulting%2C-Inc.-Bellevue-WA-98006/pyrmid/16-32835?icid=sr1-1p&q=pyramid&l=Seattle,%20WA',
        ]

        for url in urls:
            """For each url you're sending the spider to, make a request.
            Run parse() on the response object you get back."""
            yield scrapy.Request(url=url, callback=self.parse)
ArchiveSpider.py (project: mailingListScraper, author: gaalcaras)
def _getbody(self, body):
        "Handle the body argument."

        if body == "false":
            self.get_body = False
            self.logger.info('Spider will not extract email body.')
wizard.py (project: hermes, author: xutaoding)
def update_settings(cls, settings):
        """ Before initializing Spider Class, configure `job_idf` to spider settings """
        job_idf = settings['job_idf']
        spider_conf = cls.mongo_conf_db.get({'_id': job_idf})
        scry_settings = ExtractorConf(spider_conf).scrapy_settings

        custom_settings = cls.custom_settings or {}
        custom_settings.update(scry_settings)
        settings.setdict(custom_settings, priority='spider')
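A minimal sketch of how `job_idf` could be placed in the settings before the crawler builds the spider, so that update_settings() above can read it; the import path, identifier, and WizardSpider name are assumptions:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from hermes.spiders.wizard import WizardSpider   # hypothetical import path

settings = get_project_settings()
settings.set('job_idf', 'demo-job-001')          # hypothetical job identifier
process = CrawlerProcess(settings)
process.crawl(WizardSpider)
process.start()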
??????.py (project: User-Python-Write-a-web-crawler, author: xiexiaoshinick)
def parse(self, response):
        item=MyfirstpjtItem()
        item["urlname"]=response.xpath("/html/head/title/text()")
        print(item["urlname"])

#12.7   Spider
#(1)
# -*- coding: utf-8 -*-
adc_spider.py (project: AmazonScraping, author: santoshghimire)
def quit(self, spider):
        # second param is instance of spider about to be closed.
        try:
            self.url.db.close()
            print('Spider closed, fetching product urls stopped')
        except:
            pass

