def run():
configure_logging()
    # import the project settings so that middlewares and other
    # project-level configuration are applied to the crawlers
settings = get_project_settings()
runner = CrawlerRunner(settings)
# running spiders sequentially (non-distributed)
@defer.inlineCallbacks
def crawl():
yield runner.crawl(IPTesterSpider)
yield runner.crawl(UATesterSpider)
reactor.stop()
crawl()
reactor.run() # block until the last call
Python get_project_settings() example source code
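For orientation before the individual snippets, here is a minimal, self-contained sketch of the pattern most of them share; the spider name 'example' is a placeholder assumption, not taken from any snippet below.

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

def run_example():
    # get_project_settings() loads the enclosing project's settings.py;
    # it must be called from inside a Scrapy project (or with
    # SCRAPY_SETTINGS_MODULE pointing at a settings module).
    settings = get_project_settings()
    process = CrawlerProcess(settings)
    process.crawl('example')  # 'example' is a hypothetical spider name
    process.start()           # blocks until the crawl finishes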
def runspider(name):
configure_logging(install_root_handler=False)
logging.basicConfig(
filename='log/%s.log' % name,
format='%(levelname)s %(asctime)s: %(message)s',
level=logging.DEBUG
)
process = CrawlerProcess(get_project_settings())
try:
logging.info('runspider start spider:%s' % name)
process.crawl(name)
process.start()
except Exception as e:
logging.exception('runspider spider:%s exception:%s' % (name, e))
logging.debug('finish this spider:%s\n\n' % name)
def start_requests(self):
settings = get_project_settings()
city_list = settings["CITY_LIST"]
if self.city:
city_cn_name = city_list.get(self.city)
yield scrapy.FormRequest(
url=self.base_url + self.city + "_gongyu",
formdata={"startDate": self.start_date, "endDate": self.end_date},
callback=self.parse,
meta={'city_en_name': self.city, "city_cn_name": city_cn_name}
)
else:
for city_en_name, city_cn_name in city_list.items():
yield scrapy.FormRequest(
url=self.base_url + city_en_name + "_gongyu",
formdata={"startDate": self.start_date, "endDate": self.end_date},
callback=self.parse,
meta={'city_en_name': city_en_name, "city_cn_name": city_cn_name}
)
def crawl(spider, *args, **kwargs):
"""Run a spider.
Args:
spider (str): The Scrapy `name` of the spider.
"""
settings = get_project_settings()
    if kwargs.get('ignore_robots_txt') is True:
        # disable robots.txt compliance for this run
        settings.set('ROBOTSTXT_OBEY', False)
proc = CrawlerProcess(settings)
try:
proc.crawl(spider, *args, **kwargs)
proc.start()
except KeyError as err:
# Log a warning if the scraper name is invalid instead of
# causing the job to fail.
# NOTE: If there is any other type of error, the job will fail, and all
# the jobs that depend on it will fail as well.
logger.warning(err.args[0])
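A hypothetical invocation of the helper above; the spider name 'products' is a placeholder, not taken from the snippet:

crawl('products', ignore_robots_txt=True)  # run the 'products' spider with robots.txt checks disabled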
def get_feeds_settings(file_=None):
if file_:
logger.debug('Parsing configuration file {} ...'.format(file_.name))
# Parse configuration file and store result under FEEDS_CONFIG of
# scrapy's settings API.
parser = configparser.ConfigParser()
parser.read_file(file_)
config = {s: dict(parser.items(s)) for s in parser.sections()}
else:
config = {}
settings = get_project_settings()
settings.set('FEEDS_CONFIG', config)
# Mapping of feeds config section to setting names.
for settings_key, config_key in FEEDS_CFGFILE_MAPPING.items():
config_value = config.get('feeds', {}).get(config_key)
if config_value:
settings.set(settings_key, config_value)
return settings
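A hedged usage sketch for get_feeds_settings(); the file name feeds.cfg is an assumption, and FEEDS_CFGFILE_MAPPING is expected to be defined in the surrounding module:

with open('feeds.cfg') as config_file:          # hypothetical feeds configuration file
    settings = get_feeds_settings(config_file)
print(settings.get('FEEDS_CONFIG'))             # parsed sections are stored under FEEDS_CONFIG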
def runspider(name, product_id):
configure_logging(install_root_handler = False)
logging.basicConfig(
filename = 'log/%s.log' % product_id,
format = '%(levelname)s %(asctime)s: %(message)s',
level = logging.DEBUG
)
process = CrawlerProcess(get_project_settings())
try:
logging.info('runscrapy start spider:%s' % name)
data = {
'product_id': product_id
}
process.crawl(name, **data)
process.start()
    except Exception as e:
        logging.error('runscrapy spider:%s exception:%s' % (name, e))
logging.info('finish this spider:%s\n\n' % name)
def link_parse(self, response):
deeps = get_project_settings()['SPIDER_DEEP']
    # extract detail-page links from the list page
links = response.xpath("//li[@class='hideli']/a/@href").extract()
if len(links) == 0:
yield self.parse_content(response)
else:
for link_item in links:
yield Request(DOMAIN + link_item, callback=self.parse_content)
    # follow pagination links
    link_page = response.xpath("//a[@class='page_a']/@href").extract()
    print("link_page:", link_page)
for page_item in link_page:
page_id_list = page_item.split("&p=")
this_page_list = response.url.split("&p=")
this_index = 1
if len(this_page_list) == 2:
this_index = this_page_list[-1]
if len(page_id_list) == 2 and int(this_index) < int(page_id_list[-1]) < deeps:
            print(page_item)
yield Request(DOMAIN + "/news" + page_item, callback=self.link_parse)
def link_parse(self, response):
deeps = get_project_settings()['SPIDER_DEEP']
    # extract detail-page links from the list page
links = response.xpath("//li[@class='pbox clr']/div[@class='word']/a/@href").extract()
if len(links) > 0:
for link in links:
yield Request(DOMAIN + link, callback=self.parse_content)
page_url = response.url
page_size = page_url.split("page_")
    # a split into two parts means the URL carries a page number
if len(page_size) == 2:
page_index = page_url.split("page_")[1].replace('.html', '')
if 1 < int(page_index) < deeps:
yield Request(page_url, callback=self.link_parse)
def link_parse(self, response):
deeps = get_project_settings()['SPIDER_DEEP']
    # extract detail-page links from the list page
links = response.xpath("//article/a/@href").extract()
if len(links) == 0:
yield self.parse_content(response)
else:
for link_item in links:
yield Request(DOMAIN + link_item, callback=self.parse_content)
    # follow pagination links
    link_page = response.xpath("//div[@class='pagination']/ul/li/a/@href").extract()
    print("link_page:", link_page)
for page_item in link_page:
page_id_list = page_item.split("_")
this_page_list = response.url.split("_")
this_index = 1
if len(this_page_list) == 3:
this_index = this_page_list[-1].replace('.html', '')
if len(page_id_list) == 3 and int(this_index) < int(page_id_list[-1].replace('.html', '')) < deeps:
            print(page_item)
yield Request(DOMAIN + page_item, callback=self.link_parse)
def link_parse(self, response):
deeps = get_project_settings()['SPIDER_DEEP']
    # extract detail-page links from the list page
links = response.xpath("//li[@class='itm itm_new']/a/@href").extract()
if len(links) == 0:
yield self.parse_content(response)
else:
for link_item in links:
yield Request(DOMAIN + link_item, callback=self.parse_content)
    # follow pagination links
    link_page = response.xpath("//li[@class='itm itm_new']/span/a/@href").extract()
    print("link_page:", link_page)
for page_item in link_page:
page_id_list = page_item.split("pg=")
this_page_list = response.url.split("pg=")
this_index = 1
if len(this_page_list) == 2:
this_index = this_page_list[-1]
if len(page_id_list) == 2 and int(this_index) < int(page_id_list[-1]) < deeps:
            print(page_item)
yield Request(page_item, callback=self.link_parse)
def link_parse(self, response):
deeps = get_project_settings()['SPIDER_DEEP']
    # extract detail-page links from the list page
links = response.xpath("//article/header/h1/a/@href").extract()
if len(links) == 0:
yield self.parse_content(response)
else:
for link_item in links:
yield Request(link_item, callback=self.parse_content)
    # follow the previous-page link
    link_page = response.xpath("//div[@class='nav-previous']/a/@href").extract()
    print("link_page:", link_page)
for page_item in link_page:
page_id_list = page_item.split("page/")
this_page_list = response.url.split("page/")
this_index = 1
if len(this_page_list) == 2:
this_index = this_page_list[-1]
if len(page_id_list) == 2 and int(this_index) < int(page_id_list[-1]) < deeps:
            print(page_item)
            yield Request(page_item, callback=self.link_parse)
def get_proxy(self):
if get_project_settings().get('IS_USE_PROXY', True):
if len(self.proxys) <= 10:
self.update_proxy()
if len(self.proxys) > 0:
self.index = self.index + 1
self.index = self.index % len(self.proxys)
proxy = 'http://%s:%s' % (self.proxys[self.index].get('ip'), self.proxys[self.index].get('port'))
utils.log('++++++++++proxy:%s++++++++++' % proxy)
return proxy
return None
else:
return None
def ProcessRun():
process = CrawlerProcess(get_project_settings())
    # run a single named spider
process.crawl("news")
# process.crawl("favorite_spider")
    # run every spider registered in the project
for spider_name in process.spider_loader.list():
# print spider_name
process.crawl(spider_name)
process.start()
def __init__(self):
settings = get_project_settings()
self.__class__.postfix = settings.get('POSTFIX')
def __init__(self):
# settings = get_project_settings()
# self.__class__.sqlite_name = settings.get('sqlite_name')
# self.conn = sqlite3.connect(str(self.__class__.sqlite_name))
self.conn = sqlite3.connect('sample.db')
def connectSQLite():
# settings = get_project_settings()
# sqlite_name = settings.get('sqlite_name')
# conn = sqlite3.connect(str(sqlite_name))
conn = sqlite3.connect('sample.db')
return conn
def run(max_page=5):
settings = get_project_settings()
settings.set('MAX_PAGE', max_page, 'project')
crawler_process = CrawlerProcess(settings)
crawler_process.crawl(CaoLiuSpider)
crawler_process.start()
def get_settings():
settings = get_project_settings()
LOG_PATH = settings['LOG']
if not os.path.exists(LOG_PATH):
os.makedirs(LOG_PATH)
LOG_FILE = os.path.join(LOG_PATH, str(date.today()))
if not os.path.exists(LOG_FILE):
f = open(LOG_FILE, 'w')
f.close()
settings.set('LOG_FILE', LOG_FILE)
return settings
def start_requests(self):
settings = get_project_settings()
city_list = settings["CITY_LIST"]
if self.city:
city_cn_name = city_list.get(self.city)
yield scrapy.Request(
url=self.format_url(city_cn_name, '0'),
callback=self.parse,
meta={
'city_en_name': self.city,
"city_cn_name": city_cn_name,
"current_offset": '0',
"handle_httpstatus_list": [400, 500, 404]
},
)
else:
for city_en_name, city_cn_name in city_list.items():
yield scrapy.Request(
url=self.format_url(city_cn_name, '0'),
callback=self.parse,
meta={
'city_en_name': city_en_name,
"city_cn_name": city_cn_name,
"current_offset": '0',
"handle_httpstatus_list": [400, 500, 404]
},
)
def crawl(spider_name, results_dir):
""" Run one or more spiders """
settings = get_project_settings()
# prevent scrapy from configuring its own logging, since we already have it
settings.set('LOG_ENABLED', False)
process = CrawlerProcess(settings)
for s in spider_name:
process.settings.set('FEED_URI',
'file://%s.jsonlines' % os.path.join(results_dir, s))
process.settings.set('FEED_FORMAT', 'jsonlines')
spider = process.spider_loader.load(s)
process.crawl(spider)
process.start()
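A hypothetical call of the helper above, assuming spiders named 'news' and 'events' are registered in the project and the results directory already exists:

crawl(['news', 'events'], '/tmp/results')  # writes /tmp/results/news.jsonlines and /tmp/results/events.jsonlines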
def __init__(self, keyword, oneof=u'', exclude=u'', max_page=0, save_star=500, save_thumbs=True, save_dir=u'big', *args, **kwargs):
super(ImageCrawler, self).__init__(*args, **kwargs)
settings = get_project_settings()
self.pixiv_id = settings['PIXIV_ID']
self.pixiv_pass = settings['PIXIV_PASS']
self.max_page = int(max_page)
print keyword
if platform.system() == 'Windows':
self.keyword = keyword.decode('gbk').replace('##', ' ')
if oneof is not None and oneof != u'':
self.keyword += u' (' + oneof.decode('gbk').replace('##', ' OR ') + u')'
if exclude is not None and exclude != u'':
excludes = exclude.split('##')
for excl in excludes:
self.keyword += u' -' + excl.decode('gbk')
self.img_save_dir = save_dir.decode('gbk')
else:
self.keyword = keyword.replace('##', ' ')
if oneof is not None and oneof != u'':
self.keyword += u' (' + oneof.replace('##', ' OR ') + u')'
if exclude is not None and exclude != u'':
excludes = exclude.split('##')
for excl in excludes:
self.keyword += u' -' + excl
self.img_save_dir = save_dir
self.save_star = int(save_star)
self.save_thumbs = save_thumbs == 'True' or save_thumbs == True
print self.keyword, self.max_page, self.save_star, self.save_thumbs
def create_table(self):
conn = sqlite3.connect(get_project_settings()['DATABASE_POSITION'])
conn.text_factory = str
try:
conn.execute(
"""CREATE TABLE pixiv_item(
id INTEGER PRIMARY KEY,
title TEXT, link TEXT,
star INTEGER, multi INTEGER,
keyword TEXT, publish TIMESTAMP)""")
conn.commit()
    except sqlite3.OperationalError:
        # the table already exists
        pass
return conn
def main():
settings = get_project_settings()
process = CrawlerProcess(settings)
process.crawl("pixiv")
process.start()
def read_star():
db = sqlite3.connect(get_project_settings()['DATABASE_POSITION'])
db.row_factory = dict_factory
cursor = db.cursor()
cursor.execute('select * from pixiv_item where star is not null ORDER BY -star')
global star_array, unstar
star_array = cursor.fetchall()
def __init__(self, config):
self.settings = get_project_settings()
self.settings.set('DOWNLOAD_MAXSIZE', config.get('max_file_size', 1024 * 1024 * 2))
self.downloader = HTTPDownloadHandler(self.settings)
self.proxies = {}
self.valid_extensions = config.getlist('file_valid_extensions', "jpg, png")
_proxies = config.items('proxy', ())
for proxy_type, proxy in _proxies:
self.proxies[proxy_type] = get_proxy(proxy, proxy_type)
def crawl(spiders, query, start, end, page):
spider_logger.info("Start crawling {0} from {1} to {2}".format(query, start, end))
process = CrawlerProcess(get_project_settings())
process.crawl(spiders, query=query, start_time=start, end_time=end, index_pages=page)
process.start()
def run(cls):
runner = CrawlerRunner(get_project_settings())
@defer.inlineCallbacks
def deferred_crawl():
for spider, args, kwargs in cls.queue:
try:
yield runner.crawl(spider, *args, **kwargs)
except KeyError as err:
# Log a warning if the scraper name is invalid instead of
# causing the job to fail.
# NOTE: If there is any other type of error, the job will
# fail, and all the jobs that depend on it will fail as
# well.
logger.warning(err.args[0])
        # XXX: If every spider name fails, calling `reactor.stop()` raises an
        # "Unhandled error in Deferred" complaint and hangs. It also hangs if
        # no spiders have been run at all. There is presumably a Twisted
        # idiom for handling this; for now, just log an error.
if reactor.running:
reactor.stop()
else:
logger.critical("LocalQueue: No valid scraper names found.")
deferred_crawl()
reactor.run()
def __init__(self):
self.path = self._script_path()
try:
self.settings = project.get_project_settings() # get settings
self.configPath = self.settings.get("RESOURCE_DIR")
    except Exception:
        # no Scrapy project available; keep the script path as the fallback
        pass
if 'configPath' in self.__dict__:
self.path = self.configPath