Python CrawlerProcess() class: example source code
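Most of the examples below follow the same basic pattern: build a CrawlerProcess from project or ad-hoc settings, queue one or more spiders with crawl(), and block on start(). A minimal sketch (the spider name 'myspider' is a placeholder):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('myspider')   # the spider is looked up by name in the project's spider loader
process.start()             # blocks until every queued crawl has finished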

run_spider.py (project: rental, author: meihuanyu)
def runspider(name):
    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='log/%s.log' % name,
        format='%(levelname)s %(asctime)s: %(message)s',
        level=logging.DEBUG
    )
    process = CrawlerProcess(get_project_settings())
    try:
        logging.info('runspider start spider:%s' % name)
        process.crawl(name)
        process.start()
    except Exception as e:
        logging.exception('runspider spider:%s exception:%s' % (name, e))

    logging.debug('finish this spider:%s\n\n' % name)
index.py (project: web-search-engine, author: AnthonySigogne)
def index_job(link) :
    """
    Index a single page.
    """
    print("index page : %s"%link)

    # get final url after possible redictions
    try :
        link = url.crawl(link).url
    except :
        return 0

    process = CrawlerProcess({
        'USER_AGENT': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36",
        'DOWNLOAD_TIMEOUT':100,
        'REDIRECT_ENABLED':False,
        'SPIDER_MIDDLEWARES' : {
            'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware':True
        }
    })
    process.crawl(crawler.SingleSpider, start_urls=[link,], es_client=client, redis_conn=redis_conn)
    process.start() # block until finished
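
Keyword arguments given to process.crawl() are forwarded to the spider's constructor. The SingleSpider used above lives elsewhere in the web-search-engine project; a hypothetical sketch of how such a spider might accept those arguments (names and parse logic here are illustrative, not the project's actual code):

import scrapy

class SingleSpider(scrapy.Spider):
    name = 'single'

    def __init__(self, start_urls=None, es_client=None, redis_conn=None, *args, **kwargs):
        super(SingleSpider, self).__init__(*args, **kwargs)
        self.start_urls = start_urls or []
        self.es_client = es_client    # Elasticsearch client injected by the caller
        self.redis_conn = redis_conn  # Redis connection injected by the caller

    def parse(self, response):
        # index the fetched page with self.es_client here
        yield {'url': response.url}
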
tasks.py (project: osp-scraper, author: opensyllabus)
def crawl(spider, *args, **kwargs):
    """Run a spider.

    Args:
        spider (str): The Scrapy `name` of the spider.
    """
    settings = get_project_settings()
    if kwargs.get('ignore_robots_txt') is True:
        settings.attributes.get('ROBOTSTXT_OBEY').value = False

    proc = CrawlerProcess(settings)
    try:
        proc.crawl(spider, *args, **kwargs)
        proc.start()
    except KeyError as err:
        # Log a warning if the scraper name is invalid instead of
        # causing the job to fail.
        # NOTE: If there is any other type of error, the job will fail, and all
        # the jobs that depend on it will fail as well.
        logger.warning(err.args[0])
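
Mutating settings.attributes directly reaches into Scrapy internals; the public Settings.set() API can express the same override. A sketch, with the 'cmdline' priority chosen here as an assumption so that it outranks the project value:

settings = get_project_settings()
if kwargs.get('ignore_robots_txt') is True:
    settings.set('ROBOTSTXT_OBEY', False, priority='cmdline')
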
single_crawler.py (project: Newscrawler, author: JBH168)
def load_crawler(self, crawler, url, ignore_regex):
        """
        Loads the given crawler with the given url.

        :param class crawler: class of the crawler to load
        :param str url: url to start the crawler with
        :param regex ignore_regex: URLs matching this regex are ignored
        """
        self.process = CrawlerProcess(self.cfg.get_scrapy_options())
        self.process.crawl(
            crawler,
            self.helper,
            url=url,
            config=self.cfg,
            ignore_regex=ignore_regex)
cli.py (project: feeds, author: nblock)
def crawl(ctx, spiders, stats):
    """
    Crawl one, several, or all pages.

    What spider(s) to run is determined in the following order:

      1. Spider(s) given as argument(s)

      2. Spider(s) specified in the configuration file

    Note that if a spider is given as an argument, the spiders in the
    configuration file are ignored. All available spiders will be used to
    crawl if no arguments are given and no spiders are configured.
    """
    settings = ctx.obj['settings']
    if stats:
        settings.set('STATS_CLASS',
                     'scrapy.statscollectors.MemoryStatsCollector')

    # Start a new crawler process.
    process = CrawlerProcess(settings)
    spiders = spiders_to_crawl(process, spiders)
    if not spiders:
        logger.error('Please specify what spiders you want to run!')
    else:
        for spider in spiders:
            logger.info('Starting crawl of {} ...'.format(spider))
            process.crawl(spider)

    process.start()

    if settings.getbool('HTTPCACHE_ENABLED'):
        run_cleanup_cache(settings)
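
spiders_to_crawl() is a helper from the feeds project that is not shown here. A rough sketch of what such a helper might look like, falling back to every registered spider when nothing is specified (the SPIDERS setting name is a guess, not necessarily what feeds uses):

def spiders_to_crawl(process, argument_spiders):
    # Spiders given on the command line take precedence.
    if argument_spiders:
        return argument_spiders
    # Otherwise fall back to spiders from the configuration file, if any.
    configured = process.settings.getlist('SPIDERS')
    if configured:
        return configured
    # Finally, crawl everything the spider loader knows about.
    return process.spider_loader.list()
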
cli.py (project: feeds, author: nblock)
def cleanup(ctx):
    """
    Cleanup old cache entries.

    By default, entries older than 14 days will be removed. This value can be
    overridden in the config file.
    """
    settings = ctx.obj['settings']
    # Manually configure logging since we don't have a CrawlerProcess which
    # would take care of that.
    configure_logging(settings)

    if not settings.getbool('HTTPCACHE_ENABLED'):
        logger.error('Cache is disabled, will not clean up cache dir.')
        return 1

    run_cleanup_cache(settings)
runspider.py (project: jd_comment, author: awolfly9)
def runspider(name, product_id):
    configure_logging(install_root_handler = False)
    logging.basicConfig(
            filename = 'log/%s.log' % product_id,
            format = '%(levelname)s %(asctime)s: %(message)s',
            level = logging.DEBUG
    )
    process = CrawlerProcess(get_project_settings())
    try:
        logging.info('runscrapy start spider:%s' % name)
        data = {
            'product_id': product_id
        }
        process.crawl(name, **data)
        process.start()
    except Exception as e:
        logging.error('runscrapy spider:%s exception:%s' % (name, e))

    logging.info('finish this spider:%s\n\n' % name)
main.py (project: wallstreetcnScrapy, author: jianzhichun)
def main():
#     process = CrawlerProcess()
#     process.crawl(CommentarySpider.CommentarySpider,args=['-s','JOBDIR=crawls/CommentarySpider-1'])
#     process.crawl(jqkaCommentarySpider.jqkaCommentarySpider,args=['-s','JOBDIR=crawls/CommentarySpider-2'])
#     process.crawl(sinaCommentarySpider.sinaCommentarySpider,args=['-s','JOBDIR=crawls/CommentarySpider-3'])
#     process.start()
#     scrapy.cmdline.execute(argv=['scrapy', 'crawl', 'CommentarySpider','-s','JOBDIR=crawls/CommentarySpider-1'])
    scrapy.cmdline.execute(argv=['scrapy', 'crawl', 'jqkaCommentarySpider','-s','JOBDIR=crawls/CommentarySpider-2'])
#     scrapy.cmdline.execute(argv=['scrapy', 'crawl', 'sinaCommentarySpider','-s','JOBDIR=crawls/CommentarySpider-3'])
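
The -s JOBDIR=... option turns on Scrapy's persistent job state, so an interrupted crawl can later be resumed from disk. The commented-out CrawlerProcess variant above could pass the same thing through settings; a sketch:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
settings.set('JOBDIR', 'crawls/CommentarySpider-2')  # request queue and dupefilter state are persisted here
process = CrawlerProcess(settings)
process.crawl('jqkaCommentarySpider')
process.start()
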
run_spider.py (project: IPProxyTool, author: awolfly9)
def runspider(name):
    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='log/%s.log' % name,
        format='%(levelname)s %(asctime)s: %(message)s',
        level=logging.DEBUG
    )
    process = CrawlerProcess(get_project_settings())
    try:
        logging.info('runspider start spider:%s' % name)
        process.crawl(name)
        process.start()
    except Exception as e:
        logging.exception('runspider spider:%s exception:%s' % (name, e))

    logging.debug('finish this spider:%s\n\n' % name)
single_crawler.py (project: news-please, author: fhamborg)
def load_crawler(self, crawler, url, ignore_regex):
        """
        Loads the given crawler with the given url.

        :param class crawler: class of the crawler to load
        :param str url: url to start the crawler with
        :param regex ignore_regex: URLs matching this regex are ignored
        """
        self.process = CrawlerProcess(self.cfg.get_scrapy_options())
        self.process.crawl(
            crawler,
            self.helper,
            url=url,
            config=self.cfg,
            ignore_regex=ignore_regex)
scrapyctl.py (project: kmanga, author: aplanas)
def __init__(self, accounts, loglevel, remote=False):
        self.accounts = settings.SCRAPY_ACCOUNTS
        if accounts:
            self.accounts.update(accounts)
        self.loglevel = loglevel
        self.settings = self._get_settings()
        # Values for `loglevel`: CRITICAL, ERROR, WARNING, INFO, DEBUG.
        self.settings.set('LOG_LEVEL', loglevel)
        if remote:
            # Configure remote logging and disable the scrapy logging.
            self.settings.set('LOG_ENABLED', False)
            logger = logging.getLogger()
            handler = ScrapySocketHandler(
                'localhost', logging.handlers.DEFAULT_TCP_LOGGING_PORT)
            handler.setLevel(loglevel)
            logger.addHandler(handler)

        self.process = CrawlerProcess(self.settings)
collector.py (project: collectors, author: opentrials)
def collect(conf, conn, date_from=None, date_to=None):
    process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
    process.crawl(Spider, conn=conn, date_from=date_from, date_to=date_to)
    process.start()
crawl.py (project: hoaxy-backend, author: IUNetSci)
def fetch_url(cls, session, msites, platform_id, purpose):
        """Actual method to do fetch url action.

        Parameters
        ----------
            msites : list
                a list of Site model class, contains info to build spiders.
            platform_id : int
                id of platform, bind fetched url with this id.
            purpose : {'update', 'archive'}
                indicates which URLs to fetch.
        """
        settings = Settings(cls.conf['crawl']['scrapy'])
        settings.set('ITEM_PIPELINES',
                     {'hoaxy.crawl.pipelines.UrlPipeline': 300})
        process = CrawlerProcess(settings)
        sll = cls.conf['logging']['loggers']['scrapy']['level']
        logging.getLogger('scrapy').setLevel(logging.getLevelName(sll))
        for ms in msites:
            for sm in build_spiders_iter(ms, purpose):
                sm['kwargs']['session'] = session
                sm['kwargs']['platform_id'] = platform_id
                process.crawl(sm['cls'], *sm['args'], **sm['kwargs'])
        process.start()
crawl.py (project: hoaxy-backend, author: IUNetSci)
def fetch_html(cls, session, url_tuples):
        """Actual method to do fetch html action.

        Parameters
        ----------
            session : object
                a SQLAlchemy session object.
            url_tuples : list
                a list of url tuple (id, raw, status_code).
        """
        settings = Settings(cls.conf['crawl']['scrapy'])
        settings.set('ITEM_PIPELINES',
                     {'hoaxy.crawl.pipelines.HtmlPipeline': 300})
        process = CrawlerProcess(settings)
        sll = cls.conf['logging']['loggers']['scrapy']['level']
        logging.getLogger('scrapy').setLevel(logging.getLevelName(sll))
        logger.warning('Number of URLs to fetch HTML for: %s', len(url_tuples))
        process.crawl(
            HtmlSpider,
            session=session,
            url_tuples=url_tuples,
            excluded_domains=cls.conf['crawl']['excluded_domains'])
        process.start()
crawl.py (project: hoaxy-backend, author: IUNetSci)
def parse_article(cls, session, url_tuples):
        """Actual method to do parse to article action.

        Parameters
        ----------
            session : object
                a SQLAlchemy session object.
            url_tuples : list
                a list of url tuple (id, created_at, date_published,
                canonical, site_id)
        """
        settings = Settings(cls.conf['crawl']['scrapy'])
        settings.set('ITEM_PIPELINES',
                     {'hoaxy.crawl.pipelines.ArticlePipeline': 300})
        process = CrawlerProcess(settings)
        sll = cls.conf['logging']['loggers']['scrapy']['level']
        logging.getLogger('scrapy').setLevel(logging.getLevelName(sll))
        logger.info('Number of URLs to parse: %s', len(url_tuples))
        process.crawl(
            ArticleParserSpider,
            session=session,
            url_tuples=url_tuples,
            api_key=cls.conf['crawl']['article_parser']['webparser_api_key'],)
        process.start()
EuropythonSpyder.py (project: pydata_webscraping, author: jmortega)
def main():
    """Rutina principal para la ejecución del Spider"""
    # set up signal to catch items scraped
    def catch_item(sender, item, **kwargs):
        print "Item extracted:", item
    dispatcher.connect(catch_item, signal=signals.item_passed)

    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED",False)   

    # setup crawler
    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)

    # define the spider for the crawler
    crawler.crawl(EuropythonSpyder())

    # start scrapy
    print "STARTING ENGINE"
    crawler.start()  # start the crawler, running the spider defined above
    print "ENGINE STOPPED"
crawlerBlog.py (project: pydata_webscraping, author: jmortega)
def main():
    """Rutina principal para la ejecución del Spider"""
    # set up signal to catch items scraped
    def catch_item(sender, item, **kwargs):
        print "Item Extraido:", item
    dispatcher.connect(catch_item, signal=signals.item_passed)

    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED",False)   

    # setup crawler
    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)

    # define the spider for the crawler
    crawler.crawl(BloggerSpider())

    # start scrapy
    print "STARTING ENGINE"
    crawler.start()  # start the crawler, running the spider defined above
    print "ENGINE STOPPED"
PydataSpiderDetails.py (project: pydata_webscraping, author: jmortega)
def main():
    from scrapy.xlib.pydispatch import dispatcher

    """Rutina principal para la ejecución del Spider"""
    # set up signal to catch items scraped
    def catch_item(sender, item, **kwargs):
        print "Item extracted:", item
    dispatcher.connect(catch_item, signal=signals.item_passed)

    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED",False)   

    # setup crawler
    from scrapy.crawler import CrawlerProcess

    crawler = CrawlerProcess(settings)

    # define the spider for the crawler
    crawler.crawl(PydataSpiderDetails())

    print "STARTING ENGINE"
    crawler.start()  # start the crawler
    print "ENGINE STOPPED"
ProcessRun.py (project: Python_Stock_Github, author: DavidFnck)
def ProcessRun():

    process = CrawlerProcess(get_project_settings())
    # queue the "news" spider
    process.crawl("news")
    # process.crawl("favorite_spider")
    # queue every spider registered in the project
    for spider_name in process.spider_loader.list():
        # print spider_name
        process.crawl(spider_name)
    process.start()
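
Note that if the news spider is also registered in the project, the loop above queues it a second time, because process.crawl("news") has already been called. A sketch that schedules each registered spider exactly once:

def ProcessRun():
    process = CrawlerProcess(get_project_settings())
    for spider_name in process.spider_loader.list():
        process.crawl(spider_name)
    process.start()
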
runspider.py (project: caoliuscrapy, author: leyle)
def run(max_page=5):
    settings = get_project_settings()
    settings.set('MAX_PAGE', max_page, 'project')
    crawler_process = CrawlerProcess(settings)
    crawler_process.crawl(CaoLiuSpider)
    crawler_process.start()
crawl.py (project: makinami, author: Coderhypo)
def __init__(self):
        self.crawler = CrawlerProcess(settings)
cli.py (project: StrepHit, author: Wikidata)
def crawl(spider_name, results_dir):
    """ Run one or more spiders """
    settings = get_project_settings()
    # prevent scrapy from configuring its own logging, since we already have it
    settings.set('LOG_ENABLED', False)

    process = CrawlerProcess(settings)
    for s in spider_name:
        process.settings.set('FEED_URI',
                             'file://%s.jsonlines' % os.path.join(results_dir, s))
        process.settings.set('FEED_FORMAT', 'jsonlines')
        spider = process.spider_loader.load(s)
        process.crawl(spider)
    process.start()
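
FEED_URI and FEED_FORMAT work in the Scrapy versions this project targets; Scrapy 2.1+ folds both into the single FEEDS setting. A sketch of the equivalent per-spider configuration inside the same loop:

        feed_uri = 'file://%s.jsonlines' % os.path.join(results_dir, s)
        process.settings.set('FEEDS', {feed_uri: {'format': 'jsonlines'}})
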
Crawl.py (project: Pixiv-Spider, author: cathor01)
def main():
    settings = get_project_settings()
    process = CrawlerProcess(settings)
    process.crawl("pixiv")
    process.start()
initiator.py (project: wechat-crawler, author: DMGbupt)
def crawl(spiders, query, start, end, page):
    spider_logger.info("Start crawling {0} from {1} to {2}".format(query, start, end))
    process = CrawlerProcess(get_project_settings())
    process.crawl(spiders, query=query, start_time=start, end_time=end, index_pages=page)
    process.start()
main.py (project: Music-Scraper, author: srivatsan-ramesh)
def start_gui(process):
    """
    Starts the GUI and stops the Scrapy crawler process when the program is exited.

    :param CrawlerProcess process: The Scrapy crawler process used to scrape the web; the instance is kept so it can be stopped.
    """

    def create_ui(screen):
        """
        A function passes to curses wrapper for safe execution of terminal GUI.

        :param screen: The screen parameter to run the GUI. Sent from the curses wrapper.
        """

        GUI.screen = screen  # All the static variables of the GUI class are initialized
        GUI.strings = []  # the list of songs is empty initially
        GUI.init_display()  # init the variables required for GUI
        GUI.update_on_key()  # Starts a loop that waits for key input and acts accordingly

        curses.nocbreak()
        curses.echo()
        curses.endwin()
        GUI.gui_stopped = True

    curses.wrapper(create_ui)
    process.stop()  # Stopping the scrapy crawler process
searchMiner.py (project: twitter-sentiment, author: words-sdsc)
def startCrawler():
    """ Initiates process of the web crawler above.

    Arguments: None

    Return: None
    """

    # Starts a Twisted reactor, configures logging and sets shutdown handlers
    process = CrawlerProcess({
        'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
    })
    process.crawl(TwitterSpider)
    process.start()
cli.py (project: feeds, author: nblock)
def list():
    """List all available spiders."""
    settings = get_project_settings()
    settings['LOG_ENABLED'] = False
    process = CrawlerProcess(settings)
    for s in sorted(process.spider_loader.list()):
        print(s)
scraper.py (project: czl-scrape, author: code4romania)
def main():
    from scrapy.crawler import CrawlerProcess
    process = CrawlerProcess()
    process.crawl(EducatieSpider)
    process.start()
comunicatii.py (project: czl-scrape, author: code4romania)
def main():
    process = CrawlerProcess()
    process.crawl(ComunicatiiSpider)
    process.start()
dialog.py (project: czl-scrape, author: code4romania)
def main():
    from scrapy.crawler import CrawlerProcess
    process = CrawlerProcess()
    process.crawl(DialogSpider)
    process.start()
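
The three czl-scrape entry points above construct CrawlerProcess() with no arguments, so only Scrapy's built-in defaults apply and no project settings module is loaded. A sketch of passing a few overrides inline (the values are illustrative, not the project's actual configuration):

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess({
    'USER_AGENT': 'czl-scrape crawler',
    'DOWNLOAD_DELAY': 1.0,     # be polite to the scraped portal
    'ROBOTSTXT_OBEY': True,
})
process.crawl(DialogSpider)
process.start()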

