Example source code for Python's CrawlerProcess() class
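CrawlerProcess is the helper class Scrapy provides for running spiders from a plain Python script instead of the scrapy crawl command: construct it with a settings object or dict, schedule one or more spiders with crawl(), then call start(), which runs the Twisted reactor and blocks until every crawl has finished. The snippets below are real-world uses collected from open-source projects; as a baseline, here is a minimal self-contained sketch in which the spider name and URL are placeholders of my own, not taken from any of the projects:

import scrapy
from scrapy.crawler import CrawlerProcess

class QuotesSpider(scrapy.Spider):
    # hypothetical spider, used only to illustrate the API
    name = "quotes"
    start_urls = ["http://quotes.toscrape.com/"]

    def parse(self, response):
        for quote in response.css("div.quote"):
            yield {"text": quote.css("span.text::text").get()}

process = CrawlerProcess({"USER_AGENT": "example-bot"})
process.crawl(QuotesSpider)
process.start()  # blocks until the crawl is finished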

run.py (project: decoration-design-crawler, author: imflyn)
def start_scrapy(self):
        self.process = CrawlerProcess(self.setting)
        self.crawl()
        reactor.run()
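The snippet above schedules the crawl and then drives the Twisted reactor directly with reactor.run() instead of calling process.start(). When you want to manage the reactor yourself like this, Scrapy's CrawlerRunner is the class intended for it, since CrawlerProcess normally starts and stops the reactor on its own inside start(). A sketch of that variant, reusing the hypothetical QuotesSpider from the baseline sketch above:

from twisted.internet import reactor
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging

configure_logging()
runner = CrawlerRunner()               # does not manage the reactor itself
d = runner.crawl(QuotesSpider)         # returns a Deferred
d.addBoth(lambda _: reactor.stop())    # stop the reactor when the crawl ends
reactor.run()                          # blocks until reactor.stop() is called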
ScrapeReviewCounts.py (project: Get-Positive, author: M-shin)
def getReviewCount(url):
  # Get the number of reviews
  process = CrawlerProcess(get_project_settings())
  process.crawl(review_count_spider, start_url=url)
  process.start()
tasks.py (project: aquam, author: xncbf)
def crawl_naver_blog():
    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })
    process.crawl(NaverBlogSpider)
    process.start()
__main__.py (project: wayback-machine-scraper, author: sangaline)
def main():
    # configure the settings for the crawler and spider
    args = parse_args()
    config = {
        'domains': args.domains,
        'directory': args.output,
        'allow': args.allow,
        'deny': args.deny,
        'unix': args.unix,
    }
    settings = Settings({
        'USER_AGENT': (
            'Wayback Machine Scraper/{0} '
            '(+https://github.com/sangaline/scrapy-wayback-machine)'
        ).format(get_distribution('wayback-machine-scraper').version),
        'LOG_LEVEL': 'DEBUG' if args.verbose else 'INFO',
        'DOWNLOADER_MIDDLEWARES': {
            'scrapy_wayback_machine.WaybackMachineMiddleware': 5,
        },
        'AUTOTHROTTLE_ENABLED': True,
        'AUTOTHROTTLE_DEBUG': args.verbose,
        'AUTOTHROTTLE_START_DELAY': 1,
        'AUTOTHROTTLE_TARGET_CONCURRENCY': args.concurrency,
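        # 'from' is a Python keyword, so the parsed argparse value below is read with getattr(args, 'from')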
        'WAYBACK_MACHINE_TIME_RANGE': (getattr(args, 'from'), args.to),
    })

    # start the crawler
    process = CrawlerProcess(settings)
    process.crawl(MirrorSpider, **config)
    process.start()
run.py (project: blog_analysis, author: Databrawl)
def get_top_blogs():
    """
    Get URLs of most popular blog posts for most popular programming languages
    on GitHub.
    """
    languages = get_top_languages(30, settings['LANGUAGES_DATA'])
    process = CrawlerProcess(settings)
    process.crawl(BlogsSpider, languages)
    process.start()  # the script will block here until the crawling is done
run.py (project: blog_analysis, author: Databrawl)
def estimate_traffic():
    """
    Analyze traffic of the scraped blogs.
    """
    process = CrawlerProcess(settings)
    blogs_file = get_latest_file(settings['BLOGS_FEED_DIR'])
    with open(blogs_file) as f:
        blogs = json.load(f)
    process.crawl(TrafficSpider, blogs)
    process.start()  # the script will block here until the crawling is done
ignition.py (project: tobber, author: fchamicapereira)
def run_crawler(self):

        process = CrawlerProcess(self.settings)

        if self.args.anime:
            if self.args.skip is None or 'nyaa' not in self.args.skip:
                process.crawl(Nyaa,  title=self.search, season=self.args.season, file=self.args.file)

            if self.args.skip is None or 'shanaproject' not in self.args.skip:
                process.crawl(Shanaproject,  title=self.search, season=self.args.season, file=self.args.file)

        else:
            if self.args.skip is None or 'zooqle' not in self.args.skip:
                process.crawl(Zooqle, title=self.search, season=self.args.season, file=self.args.file)

            if self.args.skip is None or '1337x' not in self.args.skip:
                process.crawl(_1337x, title=self.search, season=self.args.season, file=self.args.file)

            if self.args.skip is None or 'eztv' not in self.args.skip:
                process.crawl(Eztv,   title=self.search, season=self.args.season, file=self.args.file)

            if self.args.skip is None or 'rarbg' not in self.args.skip:
                process.crawl(Rarbg, title=self.search, season=self.args.season, file=self.args.file)

            if self.args.skip is None or 'torrentdownloads' not in self.args.skip:
                process.crawl(Torrentdownloads, title=self.search, season=self.args.season, file=self.args.file)

            if self.args.skip is None or 'limetorrents' not in self.args.skip:
                process.crawl(Limetorrents, title=self.search, season=self.args.season, file=self.args.file)

            if self.args.skip is None or 'thepiratebay' not in self.args.skip:
                process.crawl(Thepiratebay, title=self.search, season=self.args.season, file=self.args.file)

        process.start()
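Every keyword argument passed to process.crawl() above (title=, season=, file=) is handed through to the spider's __init__, so each spider can keep them as instance attributes. A sketch of how such a spider might receive them; the class and attribute names here are illustrative, not tobber's actual code:

import scrapy

class TorrentSpider(scrapy.Spider):
    # hypothetical spider showing how crawl() kwargs reach the spider
    name = "torrent"

    def __init__(self, title=None, season=None, file=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.title = title
        self.season = season
        self.file = file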
index.py (project: web-search-engine, author: AnthonySigogne)
def explore_job(link) :
    """
    Explore a website and index all urls (redis-rq process).
    """
    print("explore website at : %s"%link)

    # get the final url after possible redirections
    try :
        link = url.crawl(link).url
    except :
        return 0

    # create or update domain data
    domain = url.domain(link)
    res = client.index(index="web", doc_type='domain', id=domain, body={
        "homepage":link,
        "domain":domain,
        "last_crawl":datetime.now()
    })

    # start crawler
    process = CrawlerProcess({
        'USER_AGENT': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36",
        'DOWNLOAD_TIMEOUT':100,
        'DOWNLOAD_DELAY':0.25,
        'ROBOTSTXT_OBEY':True,
        'HTTPCACHE_ENABLED':False,
        'REDIRECT_ENABLED':False,
        'SPIDER_MIDDLEWARES' : {
            'scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware':True,
            'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware':True,
            'scrapy.downloadermiddlewares.httpcache.HttpCacheMiddleware':True,
            'scrapy.extensions.closespider.CloseSpider':True
        },
        'CLOSESPIDER_PAGECOUNT':500 #only for debug
    })
    process.crawl(crawler.Crawler, allowed_domains=[urlparse(link).netloc], start_urls = [link,], es_client=client, redis_conn=redis_conn)
    process.start()

    return 1
hearthstats.py (project: hsdata, author: youfou)
def _crawl(deck_ids):
        logging.info('Crawling deck data from HearthStats')
        decks = list()
        cp = CrawlerProcess({'ITEM_PIPELINES': {'hsdata.hearthstats.HearthStatsScrapyPipeline': 1}})
        cp.crawl(HearthStatsScrapySpider, deck_ids=deck_ids, decks=decks)
        cp.start()
        return decks
hsbox.py (project: hsdata, author: youfou)
def _crawl(deck_ids):
        logging.info('Crawling deck data from HSBox')
        results = list()
        cp = CrawlerProcess({'ITEM_PIPELINES': {'hsdata.hsbox.HSBoxScrapyPipeline': 1}})
        cp.crawl(HSBoxScrapySpider, deck_ids=deck_ids, results=results)
        cp.start()
        logging.info('Fetched data for {} decks from HSBox'.format(len(results)))
        return results
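Both hsdata snippets pass a plain Python list (decks / results) into the spider through cp.crawl() and read it after cp.start() returns; the item pipeline named in ITEM_PIPELINES is what fills it. A sketch of that pattern with a pipeline and attribute name of my own choosing, not hsdata's actual classes:

class CollectingPipeline:
    # appends every scraped item to the list the caller passed via cp.crawl(..., results=results)
    def process_item(self, item, spider):
        spider.results.append(item)
        return item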
crawler.py (project: Pysearch2.0, author: Pysearch)
def crawl(url):
    """Initialize crawling sequence."""
    settings = get_project_settings()
    settings.url = url
    settings["CLOSESPIDER_PAGECOUNT"] = CRAWL_COUNT
    settings["DEPTH_LEVEL"] = DEPTH_LEVEL
    process = CrawlerProcess(settings)

    class ThisSpider(CrawlingSpider):
        """Create a spider to crawl with."""

        start_urls = [url]
    process.crawl(ThisSpider)
    process.start()
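Note that settings.url = url only attaches an ordinary Python attribute to the Settings object; dictionary-style assignment such as settings["CLOSESPIDER_PAGECOUNT"] = CRAWL_COUNT is what defines an actual Scrapy setting. Per-run values like a start URL are more commonly handed to the spider itself, as harvester.py below does with process.crawl(HarvestSpider, url=url).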
harvester.py (project: Pysearch2.0, author: Pysearch)
def harvest(url):
    """Initialize harvest sequence."""
    settings = get_project_settings()
    settings.url = url
    process = CrawlerProcess(settings)
    process.crawl(HarvestSpider, url=url)
    process.start()
test_wizard.py (project: hermes, author: xutaoding)
def test_waizard_spider():
    crawler = CrawlerProcess(get_project_settings())
    crawler.crawl(WizardSpider)
    crawler.start()
main.py (project: myaddons, author: luohuayong)
def run():
    # load the project settings from settings.py
    settings = get_project_settings()
    process = CrawlerProcess(settings=settings)

    # more spiders can be scheduled here
    # process.crawl(Spider1)
    # process.crawl(Spider2)
    process.crawl(GuaziSaleSpider)

    # start crawling; the script blocks here until all spiders finish
    process.start()
collector.py (project: collectors, author: opentrials)
def collect(conf, conn):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn)
    process.start()
collector.py (project: collectors, author: opentrials)
def collect(conf, conn, page_from=None, page_to=None):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn, page_from=page_from, page_to=page_to)
    process.start()
collector.py (project: collectors, author: opentrials)
def collect(conf, conn):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn)
    process.start()
collector.py (project: collectors, author: opentrials)
def collect(conf, conn, date_from=None, date_to=None):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn, date_from=date_from, date_to=date_to)
    process.start()
collector.py (project: collectors, author: opentrials)
def collect(conf, conn, date_from=None, date_to=None):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn, date_from=date_from, date_to=date_to)
    process.start()
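In all of these examples process.start() blocks until every scheduled crawl has finished, and because the underlying Twisted reactor cannot be restarted, a CrawlerProcess is normally started only once per Python process; scripts that need to crawl repeatedly tend to spawn a fresh process for each run or switch to CrawlerRunner.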

