def __init__(self):
    self.settings = project.get_project_settings()  # get settings
    self.MYSQL_HOST = self.settings.get('MYSQL_HOST')
    self.MYSQL_PORT = self.settings.getint('MYSQL_PORT')
    self.MYSQL_USER = self.settings.get('MYSQL_USER')
    self.MYSQL_PASSWD = self.settings.get('MYSQL_PASSWD')
    self.MYSQL_DB = self.settings.get('MYSQL_DB')
    self._conn()
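# The __init__ above calls self._conn(), which is not shown in this snippet.
# A minimal sketch of such a helper, assuming pymysql as the driver; the method
# below is hypothetical and only illustrates handing the settings values over
# to a database connection.
import pymysql

def _conn(self):
    self.connection = pymysql.connect(
        host=self.MYSQL_HOST,
        port=self.MYSQL_PORT,
        user=self.MYSQL_USER,
        password=self.MYSQL_PASSWD,
        db=self.MYSQL_DB,
        charset='utf8mb4',
    )
    self.cursor = self.connection.cursor()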
def __init__(self, collection_name):
    self.settings = project.get_project_settings()  # get settings
    self.MONGO_URL = self.settings.get("MONGO_URL", "localhost")
    # use the attribute assigned above (was self.mongo_url, which is never defined)
    self.client = MongoClient(host=self.MONGO_URL, tz_aware=True)
    self.db = self.client['crawl_db']
    self.posts = self.db[collection_name]
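# A hypothetical process_item for the Mongo-backed pipeline above; the method
# name and the use of dict(item) are assumptions, not from the original snippet.
def process_item(self, item, spider):
    self.posts.insert_one(dict(item))
    return item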
def __init__(self):
    print('preparing ------------------')
    self.start_page = 1
    self.modelUrl = 'https://www.lagou.com/jobs/'
    self.DBK = get_project_settings().getdict('DBK')  # read the DBK dict from settings
    hp.NEWHTTPS()  # refresh the HTTPS proxy pool
    self.oldPages = self.getOldpages()  # load the pages that were already crawled
def __init__(self):
    self.DBK = get_project_settings().getdict('DBK')
def list():
    """List all available spiders."""
    settings = get_project_settings()
    settings['LOG_ENABLED'] = False
    process = CrawlerProcess(settings)
    for s in sorted(process.spider_loader.list()):
        print(s)
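# The same listing can also be produced without building a CrawlerProcess,
# by using Scrapy's spider loader directly; a minimal sketch:
from scrapy.spiderloader import SpiderLoader
from scrapy.utils.project import get_project_settings

def list_spiders():
    loader = SpiderLoader.from_settings(get_project_settings())
    return sorted(loader.list())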
def run(self, args, opts):
    conn = redis.Redis(decode_responses=True)
    runner = CrawlerRunner(get_project_settings())
    try:
        rules = Rule.loads()
        if not rules:
            raise ValueError
    except ValueError:
        print('Error loading Redis rules, falling back to CSV rules')
        rules = Rule.loads('csv')
    for rule in rules:
        rule.save()
        if rule.name in self.excludes:
            continue
        if conn.hget('Rule:' + rule.name, 'status') == 'started':
            d = runner.crawl(ProxySpider, rule)
            # Set status to finished once the crawler is done; bind the rule
            # name as a default argument so each callback keeps its own name
            # instead of closing over the loop variable.
            d.addBoth(lambda _, name=rule.name: conn.hset(
                'Rule:' + name, 'status', 'finished'))
    rule_maintainer = RuleMaintainer(conn, runner)
    proxy_maintainer = ProxyMaintainer(conn)
    schedule_maintainer = ScheduleMaintainer(conn)
    lc = task.LoopingCall(rule_maintainer)
    lc.start(1)
    lc = task.LoopingCall(proxy_maintainer)
    lc.start(0.5)
    lc = task.LoopingCall(schedule_maintainer)
    lc.start(10)
    reactor.run()
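# Background for the name=rule.name default argument above: Python closures
# bind loop variables late, so without it every callback created in the loop
# would update the status of the last rule only. A self-contained illustration:
def _callbacks_broken():
    return [lambda: i for i in range(3)]       # every callback returns 2

def _callbacks_fixed():
    return [lambda i=i: i for i in range(3)]   # callbacks return 0, 1, 2

assert [f() for f in _callbacks_broken()] == [2, 2, 2]
assert [f() for f in _callbacks_fixed()] == [0, 1, 2]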
def __init__(self):
    self.settings = get_project_settings()  # read the database parameters from the project settings
    self.host = self.settings['MYSQL_HOST']
    self.port = self.settings['MYSQL_PORT']
    self.user = self.settings['MYSQL_USER']
    self.passwd = self.settings['MYSQL_PASSWD']
    self.db = self.settings['MYSQL_DBNAME']
def __init__(self):
    self.is_running = False
    dispatcher.connect(self.pause_crawler, signals.engine_stopped)
    self.setting = get_project_settings()
    self.process = None
def getReviewCount(url):
    # Get the number of reviews
    process = CrawlerProcess(get_project_settings())
    process.crawl(review_count_spider, start_url=url)
    process.start()
def __new__(cls, *args, **kwargs):
    start_list = ['http://www.coindesk.com/category/news/']
    i = 2
    deeps = get_project_settings()['SPIDER_DEEP']
    while i < deeps:
        # str(i), not bytes(i): bytes(i) would not concatenate with the URL string
        start_list.append('http://www.coindesk.com/category/news/page/' + str(i) + "/")
        i += 1
    CoinDesk.start_urls = start_list
    print(CoinDesk.start_urls)
    return super(CoinDesk, cls).__new__(cls, *args, **kwargs)
def __init__(self):
    self.settings = get_project_settings()
    self.host = self.settings['MYSQL_HOST']
    self.port = self.settings['MYSQL_PORT']
    self.user = self.settings['MYSQL_USER']
    self.passwd = self.settings['MYSQL_PASSWD']
    self.db = self.settings['MYSQL_DBNAME']
def import_settings(self):
    settings = get_project_settings()
    self.password = settings['AUTH_PASSWORD']
    self.http_proxy = settings['HTTP_PROXY']
    self.control_port = settings['CONTROL_PORT']
    self.max_req_per_ip = settings['MAX_REQ_PER_IP']
def runTest(self):
    settings = get_project_settings()
    settings.set('SPIDER_MODULES', ['classes.spiders'])
    try:
        sys.path.append(scrapy_path)
        runner = CrawlerRunner(settings)
        spiders = runner.spider_loader.list()
        self.assertEqual(set(class_pipeline.get_spiders()), set(spiders))
    except:
        pass
def __init__(self):
    # getting the settings of the project (settings.py)
    self.settings = get_project_settings()
    # processing input arguments
    self.process_args()
    # merging the arguments into the settings
    self.change_settings()
    # open mongo here just to check if the mongod service is running;
    # if it isn't, there is no point in starting the crawl
    if self.args.file is None:
        self.open_mongo()
        self.dump_collection()
    # running the spiders
    self.run_crawler()
    if self.args.file:
        self.sort_file()
    else:
        if not self.args.server:
            # working with the mongo db
            self.sort()
        # close mongo
        self.close_mongo()
def __init__(self):
    scrapy.spiders.Spider.__init__(self)
    self.global_settings = get_project_settings()
    if self.global_settings['PLATFORM'] in ['win', 'mac']:
        self.driver = webdriver.PhantomJS(executable_path=self.global_settings['PHANTOMJS_PATH'])
    elif self.global_settings['PLATFORM'] in ['linux']:
        self.driver = webdriver.PhantomJS()
    self.driver.set_page_load_timeout(30)
    self.driver.implicitly_wait(10)
    self.type_id_list = self.global_settings['CRAWLER']['type_id_list']
    self.re_type_id = re.compile(self.global_settings['CRAWLER']['re_type_id'])
    self.url_template = self.global_settings['CRAWLER']['url_template']
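# Note: PhantomJS support was removed from recent Selenium releases. A hedged
# alternative sketch using headless Chrome instead (an assumption, not part of
# the original snippet):
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

def make_headless_driver():
    options = Options()
    options.add_argument('--headless')
    driver = webdriver.Chrome(options=options)
    driver.set_page_load_timeout(30)
    driver.implicitly_wait(10)
    return driver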
def _run_spiders(ticker_list, start_date, end_date):
    configure_logging()
    runner = CrawlerRunner(settings=get_project_settings())
    spider_dict = {
        'symbols': ticker_list,
        'start_date': start_date,
        'end_date': end_date
    }
    runner.crawl(EdgarSpider, **spider_dict)
    d = runner.join()
    d.addBoth(lambda _: reactor.stop())
    reactor.run()
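# Hypothetical invocation of the helper above; the ticker symbols and the date
# range are placeholders, not values from the original source.
if __name__ == '__main__':
    _run_spiders(['AAPL', 'MSFT'], '2016-01-01', '2016-12-31')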
def crawl(url):
    """Initialize crawling sequence."""
    settings = get_project_settings()
    settings.url = url
    settings["CLOSESPIDER_PAGECOUNT"] = CRAWL_COUNT
    settings["DEPTH_LEVEL"] = DEPTH_LEVEL
    process = CrawlerProcess(settings)

    class ThisSpider(CrawlingSpider):
        """Create a spider to crawl with."""
        start_urls = [url]

    process.crawl(ThisSpider)
    process.start()
def harvest(url):
    """Initialize harvest sequence."""
    settings = get_project_settings()
    settings.url = url
    process = CrawlerProcess(settings)
    process.crawl(HarvestSpider, url=url)
    process.start()
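# Hypothetical usage of the two helpers above; the URL is a placeholder. Each
# call starts its own CrawlerProcess, and a Twisted reactor cannot be restarted
# within the same Python process, so crawl() and harvest() are meant to be run
# in separate invocations.
if __name__ == '__main__':
    crawl('https://example.com')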
def __init__(self, **kwargs):
    settings = get_project_settings()
    self.create_display()
    self.load_proxy_list()
    self.get_item_seen(settings)