def runspider(name):
configure_logging(install_root_handler=False)
logging.basicConfig(
filename='log/%s.log' % name,
format='%(levelname)s %(asctime)s: %(message)s',
level=logging.DEBUG
)
process = CrawlerProcess(get_project_settings())
try:
logging.info('runspider start spider:%s' % name)
process.crawl(name)
process.start()
except Exception as e:
logging.exception('runspider spider:%s exception:%s' % (name, e))
logging.debug('finish this spider:%s\n\n' % name)
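# Usage sketch (assumptions): runspider() above relies on these imports being
# present at module level; 'my_spider' is a hypothetical spider name
# registered in the Scrapy project, not one taken from the original code.
import logging

from scrapy.crawler import CrawlerProcess
from scrapy.utils.log import configure_logging
from scrapy.utils.project import get_project_settings

if __name__ == '__main__':
    runspider('my_spider')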
def index_job(link) :
"""
Index a single page.
"""
    print("index page: %s" % link)
    # get the final URL after possible redirections
    try:
        link = url.crawl(link).url
    except Exception:
        return 0
process = CrawlerProcess({
'USER_AGENT': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36",
'DOWNLOAD_TIMEOUT':100,
'REDIRECT_ENABLED':False,
'SPIDER_MIDDLEWARES' : {
'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware':True
}
})
process.crawl(crawler.SingleSpider, start_urls=[link,], es_client=client, redis_conn=redis_conn)
process.start() # block until finished
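# Note and sketch (assumption): CrawlerProcess.start() blocks until the crawl
# finishes and the underlying Twisted reactor cannot be restarted, so
# index_job() as written can only run once per Python process. A common
# workaround is to give every call its own subprocess:
from multiprocessing import Process

def index_job_in_subprocess(link):
    # a fresh process means a fresh reactor for each crawl
    p = Process(target=index_job, args=(link,))
    p.start()
    p.join()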
def crawl(spider, *args, **kwargs):
"""Run a spider.
Args:
spider (str): The Scrapy `name` of the spider.
"""
settings = get_project_settings()
if kwargs.get('ignore_robots_txt') is True:
settings.attributes.get('ROBOTSTXT_OBEY').value = False
proc = CrawlerProcess(settings)
try:
proc.crawl(spider, *args, **kwargs)
proc.start()
except KeyError as err:
# Log a warning if the scraper name is invalid instead of
# causing the job to fail.
# NOTE: If there is any other type of error, the job will fail, and all
# the jobs that depend on it will fail as well.
logger.warning(err.args[0])
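# Usage sketch (assumptions): 'products' is a hypothetical spider name and
# `logger` is assumed to be a module-level logging.Logger. Inside the
# function, settings.set('ROBOTSTXT_OBEY', False, priority='cmdline') would be
# the more conventional way to override the setting than reaching into
# settings.attributes directly.
crawl('products', ignore_robots_txt=True)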
def load_crawler(self, crawler, url, ignore_regex):
"""
Loads the given crawler with the given url.
:param class crawler: class of the crawler to load
:param str url: url to start the crawler with
:param regex ignore_regex: to be able to ignore urls that match this
regex code
"""
self.process = CrawlerProcess(self.cfg.get_scrapy_options())
self.process.crawl(
crawler,
self.helper,
url=url,
config=self.cfg,
ignore_regex=ignore_regex)
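# Sketch (assumption): CrawlerProcess.crawl() forwards extra positional and
# keyword arguments to the spider constructor, so a crawler class compatible
# with load_crawler() above might look roughly like this; the class and
# attribute names are illustrative, not taken from the original project.
import scrapy

class ExampleCrawler(scrapy.Spider):
    name = 'example_crawler'

    def __init__(self, helper, url=None, config=None, ignore_regex=None,
                 *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.helper = helper
        self.config = config
        self.ignore_regex = ignore_regex
        self.start_urls = [url] if url else []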
def crawl(ctx, spiders, stats):
"""
    Crawl one, many, or all pages.
What spider(s) to run is determined in the following order:
1. Spider(s) given as argument(s)
2. Spider(s) specified in the configuration file
Note that if a spider is given as an argument, the spiders in the
configuration file are ignored. All available spiders will be used to
crawl if no arguments are given and no spiders are configured.
"""
settings = ctx.obj['settings']
if stats:
settings.set('STATS_CLASS',
'scrapy.statscollectors.MemoryStatsCollector')
# Start a new crawler process.
process = CrawlerProcess(settings)
spiders = spiders_to_crawl(process, spiders)
if not spiders:
logger.error('Please specify what spiders you want to run!')
else:
for spider in spiders:
logger.info('Starting crawl of {} ...'.format(spider))
process.crawl(spider)
process.start()
if settings.getbool('HTTPCACHE_ENABLED'):
run_cleanup_cache(settings)
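# Sketch (assumption): spiders_to_crawl() is not shown in this snippet. Based
# on the docstring above, a plausible implementation falls back from the
# command-line arguments to the configured spiders and finally to every
# spider the process knows about ('SPIDERS' is a hypothetical setting name):
def spiders_to_crawl(process, spider_args):
    if spider_args:
        return list(spider_args)
    configured = process.settings.getlist('SPIDERS')
    if configured:
        return configured
    return process.spider_loader.list()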
def cleanup(ctx):
"""
Cleanup old cache entries.
By default, entries older than 14 days will be removed. This value can be
    overridden in the config file.
"""
settings = ctx.obj['settings']
# Manually configure logging since we don't have a CrawlerProcess which
# would take care of that.
configure_logging(settings)
if not settings.getbool('HTTPCACHE_ENABLED'):
logger.error('Cache is disabled, will not clean up cache dir.')
return 1
run_cleanup_cache(settings)
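# Sketch (assumption): run_cleanup_cache() is not shown here. Assuming the
# default filesystem cache backend, a minimal helper could walk HTTPCACHE_DIR
# and delete entries older than the configured number of days:
import os
import time

def run_cleanup_cache(settings, max_age_days=14):
    cache_dir = settings.get('HTTPCACHE_DIR', 'httpcache')
    cutoff = time.time() - max_age_days * 86400
    for root, _dirs, files in os.walk(cache_dir):
        for name in files:
            path = os.path.join(root, name)
            if os.path.getmtime(path) < cutoff:
                os.remove(path)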
def runspider(name, product_id):
    configure_logging(install_root_handler=False)
    logging.basicConfig(
        filename='log/%s.log' % product_id,
        format='%(levelname)s %(asctime)s: %(message)s',
        level=logging.DEBUG
    )
process = CrawlerProcess(get_project_settings())
try:
logging.info('runscrapy start spider:%s' % name)
data = {
'product_id': product_id
}
process.crawl(name, **data)
process.start()
    except Exception as e:
        logging.error('runscrapy spider:%s exception:%s' % (name, e))
logging.info('finish this spider:%s\n\n' % name)
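# Sketch (assumption): keyword arguments given to process.crawl() are passed
# to the spider constructor, so the spider named above could receive
# product_id roughly like this; the class name is illustrative.
import scrapy

class ProductSpider(scrapy.Spider):
    name = 'product'

    def __init__(self, product_id=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.product_id = product_id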
def main():
# process = CrawlerProcess()
# process.crawl(CommentarySpider.CommentarySpider,args=['-s','JOBDIR=crawls/CommentarySpider-1'])
# process.crawl(jqkaCommentarySpider.jqkaCommentarySpider,args=['-s','JOBDIR=crawls/CommentarySpider-2'])
# process.crawl(sinaCommentarySpider.sinaCommentarySpider,args=['-s','JOBDIR=crawls/CommentarySpider-3'])
# process.start()
# scrapy.cmdline.execute(argv=['scrapy', 'crawl', 'CommentarySpider','-s','JOBDIR=crawls/CommentarySpider-1'])
scrapy.cmdline.execute(argv=['scrapy', 'crawl', 'jqkaCommentarySpider','-s','JOBDIR=crawls/CommentarySpider-2'])
# scrapy.cmdline.execute(argv=['scrapy', 'crawl', 'sinaCommentarySpider','-s','JOBDIR=crawls/CommentarySpider-3'])
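# Sketch (assumption): the commented-out CrawlerProcess variant above passes
# '-s JOBDIR=...' as spider arguments, which Scrapy would hand to the spider
# rather than treat as settings. With CrawlerProcess the equivalent is to set
# JOBDIR on the settings object before starting; the function name below is
# illustrative.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

def main_with_crawler_process():
    settings = get_project_settings()
    settings.set('JOBDIR', 'crawls/CommentarySpider-2')
    process = CrawlerProcess(settings)
    process.crawl('jqkaCommentarySpider')
    process.start()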
def __init__(self, accounts, loglevel, remote=False):
self.accounts = settings.SCRAPY_ACCOUNTS
if accounts:
self.accounts.update(accounts)
self.loglevel = loglevel
self.settings = self._get_settings()
# Values for `loglevel`: CRITICAL, ERROR, WARNING, INFO, DEBUG.
self.settings.set('LOG_LEVEL', loglevel)
if remote:
# Configure remote logging and disable the scrapy logging.
self.settings.set('LOG_ENABLED', False)
logger = logging.getLogger()
handler = ScrapySocketHandler(
'localhost', logging.handlers.DEFAULT_TCP_LOGGING_PORT)
handler.setLevel(loglevel)
logger.addHandler(handler)
self.process = CrawlerProcess(self.settings)
def collect(conf, conn, date_from=None, date_to=None):
process = CrawlerProcess(conf['SCRAPY_SETTINGS'])
process.crawl(Spider, conn=conn, date_from=date_from, date_to=date_to)
process.start()
def fetch_url(cls, session, msites, platform_id, purpose):
    """Actual method that performs the fetch-url action.

    Parameters
    ----------
    msites : list
        a list of Site model instances, containing the info needed to build spiders.
    platform_id : int
        id of the platform; fetched urls are bound to this id.
    purpose : {'update', 'archive'}
        indicates which urls to fetch.
"""
settings = Settings(cls.conf['crawl']['scrapy'])
settings.set('ITEM_PIPELINES',
{'hoaxy.crawl.pipelines.UrlPipeline': 300})
process = CrawlerProcess(settings)
sll = cls.conf['logging']['loggers']['scrapy']['level']
logging.getLogger('scrapy').setLevel(logging.getLevelName(sll))
for ms in msites:
for sm in build_spiders_iter(ms, purpose):
sm['kwargs']['session'] = session
sm['kwargs']['platform_id'] = platform_id
process.crawl(sm['cls'], *sm['args'], **sm['kwargs'])
process.start()
def fetch_html(cls, session, url_tuples):
    """Actual method that performs the fetch-html action.

    Parameters
    ----------
    session : object
        a SQLAlchemy session object.
    url_tuples : list
        a list of url tuples (id, raw, status_code).
"""
settings = Settings(cls.conf['crawl']['scrapy'])
settings.set('ITEM_PIPELINES',
{'hoaxy.crawl.pipelines.HtmlPipeline': 300})
process = CrawlerProcess(settings)
sll = cls.conf['logging']['loggers']['scrapy']['level']
logging.getLogger('scrapy').setLevel(logging.getLevelName(sll))
    logger.warning('Number of URLs to fetch HTML for: %s', len(url_tuples))
process.crawl(
HtmlSpider,
session=session,
url_tuples=url_tuples,
excluded_domains=cls.conf['crawl']['excluded_domains'])
process.start()
def parse_article(cls, session, url_tuples):
    """Actual method that performs the parse-article action.

    Parameters
    ----------
    session : object
        a SQLAlchemy session object.
    url_tuples : list
        a list of url tuples (id, created_at, date_published,
        canonical, site_id).
"""
settings = Settings(cls.conf['crawl']['scrapy'])
settings.set('ITEM_PIPELINES',
{'hoaxy.crawl.pipelines.ArticlePipeline': 300})
process = CrawlerProcess(settings)
sll = cls.conf['logging']['loggers']['scrapy']['level']
logging.getLogger('scrapy').setLevel(logging.getLevelName(sll))
    logger.info('Number of URLs to parse: %s', len(url_tuples))
process.crawl(
ArticleParserSpider,
session=session,
url_tuples=url_tuples,
api_key=cls.conf['crawl']['article_parser']['webparser_api_key'],)
process.start()
def main():
    """Main routine for running the spider."""
    # set up a signal handler to catch scraped items
    def catch_item(sender, item, **kwargs):
        print("Item extracted:", item)
    dispatcher.connect(catch_item, signal=signals.item_passed)
    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED", False)
    # set up the crawler
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    # register the spider with the crawler
    crawler.crawl(EuropythonSpyder)
    # start scrapy
    print("STARTING ENGINE")
    crawler.start()  # start the crawler by running the spider defined above
    print("ENGINE STOPPED")
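# Sketch (assumption): signals.item_passed and scrapy.xlib.pydispatch come
# from old Scrapy releases; in current Scrapy the same hook is usually wired
# through the crawler's signal manager with signals.item_scraped. The spider
# class is the one used above; the function name below is illustrative.
from scrapy import signals
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

def main_modern():
    def catch_item(item, response, spider):
        print("Item extracted:", item)

    process = CrawlerProcess(Settings({"LOG_ENABLED": False}))
    crawler = process.create_crawler(EuropythonSpyder)
    crawler.signals.connect(catch_item, signal=signals.item_scraped)
    process.crawl(crawler)
    process.start()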
def main():
    """Main routine for running the spider."""
    # set up a signal handler to catch scraped items
    def catch_item(sender, item, **kwargs):
        print("Item extracted:", item)
    dispatcher.connect(catch_item, signal=signals.item_passed)
    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED", False)
    # set up the crawler
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    # register the spider with the crawler
    crawler.crawl(BloggerSpider)
    # start scrapy
    print("STARTING ENGINE")
    crawler.start()  # start the crawler by running the spider defined above
    print("ENGINE STOPPED")
def main():
    """Main routine for running the spider."""
    from scrapy.xlib.pydispatch import dispatcher

    # set up a signal handler to catch scraped items
    def catch_item(sender, item, **kwargs):
        print("Item extracted:", item)
    dispatcher.connect(catch_item, signal=signals.item_passed)
    settings = Settings()
    settings.set("USER_AGENT", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36")
    settings.set("LOG_ENABLED", False)
    # set up the crawler
    from scrapy.crawler import CrawlerProcess
    crawler = CrawlerProcess(settings)
    # define the spider for the crawler
    crawler.crawl(PydataSpiderDetails)
    print("STARTING ENGINE")
    crawler.start()  # start the crawler
    print("ENGINE STOPPED")
def ProcessRun():
process = CrawlerProcess(get_project_settings())
    # crawl a single named spider
    process.crawl("news")
    # process.crawl("favorite_spider")
    # crawl every spider registered in the project
    for spider_name in process.spider_loader.list():
        # print(spider_name)
process.crawl(spider_name)
process.start()
def run(max_page=5):
settings = get_project_settings()
settings.set('MAX_PAGE', max_page, 'project')
crawler_process = CrawlerProcess(settings)
crawler_process.crawl(CaoLiuSpider)
crawler_process.start()
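# Sketch (assumption): a setting added this way is visible to the spider
# through its crawler settings, so CaoLiuSpider could read the page limit
# roughly like this; everything except self.settings is illustrative.
import scrapy

class MaxPageAwareSpider(scrapy.Spider):
    name = 'max_page_aware'

    def start_requests(self):
        max_page = self.settings.getint('MAX_PAGE', 5)
        for page in range(1, max_page + 1):
            yield scrapy.Request('https://example.com/page/%d' % page)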
def __init__(self):
self.crawler = CrawlerProcess(settings)
def crawl(spider_name, results_dir):
""" Run one or more spiders """
settings = get_project_settings()
# prevent scrapy from configuring its own logging, since we already have it
settings.set('LOG_ENABLED', False)
process = CrawlerProcess(settings)
for s in spider_name:
process.settings.set('FEED_URI',
'file://%s.jsonlines' % os.path.join(results_dir, s))
process.settings.set('FEED_FORMAT', 'jsonlines')
spider = process.spider_loader.load(s)
process.crawl(spider)
process.start()
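# Note and sketch (assumption): setting FEED_URI/FEED_FORMAT on the shared
# process settings works because each crawler copies the settings when it is
# created. In Scrapy 2.1+ the FEEDS setting expresses the same idea more
# directly; the function name below is illustrative.
import os

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

def crawl_with_feeds(spider_names, results_dir):
    settings = get_project_settings()
    settings.set('LOG_ENABLED', False)
    process = CrawlerProcess(settings)
    for s in spider_names:
        process.settings.set('FEEDS', {
            'file://%s.jsonlines' % os.path.join(results_dir, s): {
                'format': 'jsonlines',
            },
        })
        process.crawl(process.spider_loader.load(s))
    process.start()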
def main():
settings = get_project_settings()
process = CrawlerProcess(settings)
process.crawl("pixiv")
process.start()
def crawl(spiders, query, start, end, page):
spider_logger.info("Start crawling {0} from {1} to {2}".format(query, start, end))
process = CrawlerProcess(get_project_settings())
process.crawl(spiders, query=query, start_time=start, end_time=end, index_pages=page)
process.start()
def start_gui(process):
"""
    A function that starts the GUI and stops the Scrapy crawler process when the program exits.
    :param CrawlerProcess process: The Scrapy crawler process used to scrape the web. The instance is needed so the process can be stopped.
"""
def create_ui(screen):
"""
        A function passed to the curses wrapper for safe execution of the terminal GUI.
        :param screen: The screen parameter used to run the GUI. Sent by the curses wrapper.
        """
        GUI.screen = screen  # all the static variables of the GUI class are initialized
        GUI.strings = []  # the list of songs is empty initially
        GUI.init_display()  # init the variables required for the GUI
        GUI.update_on_key()  # start a loop that waits for key input and acts accordingly
curses.nocbreak()
curses.echo()
curses.endwin()
GUI.gui_stopped = True
curses.wrapper(create_ui)
process.stop() # Stopping the scrapy crawler process
def startCrawler():
    """Starts the web crawler defined above.
Arguments: None
Return: None
"""
    # Starts a Twisted reactor, configures logging, and sets shutdown handlers
process = CrawlerProcess({
'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'
})
process.crawl(TwitterSpider)
process.start()
def list():
"""List all available spiders."""
settings = get_project_settings()
settings['LOG_ENABLED'] = False
process = CrawlerProcess(settings)
for s in sorted(process.spider_loader.list()):
print(s)
def main():
    from scrapy.crawler import CrawlerProcess
process = CrawlerProcess()
process.crawl(EducatieSpider)
process.start()
def main():
process = CrawlerProcess()
process.crawl(ComunicatiiSpider)
process.start()
def main():
from scrapy.crawler import CrawlerProcess
process = CrawlerProcess()
process.crawl(DialogSpider)
process.start()