def index_job(link):
"""
Index a single page.
"""
print("index page : %s"%link)
# get final url after possible redictions
try :
link = url.crawl(link).url
except :
return 0
process = CrawlerProcess({
'USER_AGENT': "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36",
'DOWNLOAD_TIMEOUT':100,
'REDIRECT_ENABLED':False,
'SPIDER_MIDDLEWARES' : {
'scrapy.spidermiddlewares.httperror.HttpErrorMiddleware':True
}
})
process.crawl(crawler.SingleSpider, start_urls=[link,], es_client=client, redis_conn=redis_conn)
process.start() # block until finished
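# crawler.SingleSpider is defined elsewhere in this project and is not shown in this
# listing. A minimal sketch of what such a spider might look like, assuming it only
# extracts the page title and carries the Elasticsearch/Redis clients through
# unchanged (selectors and fields here are hypothetical, purely illustrative):
import scrapy

class SingleSpider(scrapy.Spider):
    name = "single"

    def __init__(self, start_urls=None, es_client=None, redis_conn=None, *args, **kwargs):
        super(SingleSpider, self).__init__(*args, **kwargs)
        self.start_urls = start_urls or []
        self.es_client = es_client
        self.redis_conn = redis_conn

    def parse(self, response):
        # yield a bare-bones item; the real spider would index the page here
        yield {
            "url": response.url,
            "title": response.css("title::text").extract_first(),
        }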
def applyConfig(self):
try:
print("Connecting to '%s', index '%s'" % (self.confESHost, self.confESIndex))
self.es = connections.create_connection(hosts=[self.confESHost])
self.idx = Index(self.confESIndex)
self.idx.doc_type(DocHTTPRequestResponse)
if self.idx.exists():
self.idx.open()
else:
self.idx.create()
self.callbacks.saveExtensionSetting("elasticburp.host", self.confESHost)
self.callbacks.saveExtensionSetting("elasticburp.index", self.confESIndex)
self.callbacks.saveExtensionSetting("elasticburp.tools", str(self.confBurpTools))
self.callbacks.saveExtensionSetting("elasticburp.onlyresp", str(int(self.confBurpOnlyResp)))
except Exception as e:
JOptionPane.showMessageDialog(self.panel, "<html><p style='width: 300px'>Error while initializing ElasticSearch: %s</p></html>" % (str(e)), "Error", JOptionPane.ERROR_MESSAGE)
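# DocHTTPRequestResponse is declared elsewhere in this extension; a minimal sketch of
# such a DocType, assuming the elasticsearch_dsl 5.x API used throughout these
# snippets (the field names below are hypothetical):
from elasticsearch_dsl import DocType, Date, Integer, Keyword, Text

class DocHTTPRequestResponse(DocType):
    timestamp = Date()
    protocol = Keyword()
    host = Keyword()
    port = Integer()
    request_body = Text()
    response_body = Text()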
### ITab ###
def execute(self):
idx = es.Index(self.index)
try:
idx.delete(ignore=404)
log.info("Deleted index %s", self.index)
except (AttributeError, NotFoundError):
log.warning("Could not delete index '%s', ignoring", self.index)
# create doc types
for dt in self.doc_types:
idx.doc_type(dt)
# create index
idx.create()
def _synchronise_index(self, sql_table_cls, es_doc_cls, id_logger):
es_doc = es_doc_cls()
self._logging(logging.INFO,
'Synchronizing %s index.' % es_doc.index)
with acquire_inter_process_lock('sync_%s' % es_doc.index) as acquired:
if not acquired:
es_doc = es_doc_cls()
err_msg = 'Another process is already synchronizing the %s ' \
'index, aborting now.' % es_doc.index
self._logging(logging.WARNING, err_msg)
else:
self._perform_index_sync(sql_table_cls, es_doc_cls, id_logger)
self._logging(logging.INFO,
'Index %s is now synchronized.' % es_doc.index)
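# acquire_inter_process_lock is a project helper that is not part of this listing; a
# minimal sketch of one possible implementation, using a non-blocking advisory file
# lock (the lock-file path is a hypothetical choice):
import fcntl
from contextlib import contextmanager

@contextmanager
def acquire_inter_process_lock(name):
    lock_file = open('/tmp/%s.lock' % name, 'w')
    acquired = False
    try:
        try:
            fcntl.flock(lock_file, fcntl.LOCK_EX | fcntl.LOCK_NB)
            acquired = True
        except (IOError, OSError):
            pass
        yield acquired
    finally:
        if acquired:
            fcntl.flock(lock_file, fcntl.LOCK_UN)
        lock_file.close()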
def _perform_geocomplete_index_population(self, max_doc):
elasticsearch_conn = connections.get_connection()
to_index = list()
for i, document in enumerate(self._geocompletion_documents()):
if i % max_doc == 0:
log_msg = 'Computing required geoloc-entry documents.'
self._logging(logging.INFO, log_msg)
to_index.append(document.to_dict(True))
if len(to_index) < max_doc:
continue
self._geocomplete_index_batch(elasticsearch_conn, to_index)
to_index = list()
if len(to_index) != 0:
self._geocomplete_index_batch(elasticsearch_conn, to_index)
elasticsearch_dsl.Index('geocomplete').refresh()
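# _geocomplete_index_batch is not shown in this listing; a minimal sketch of such a
# method on the same class, feeding the already-serialized documents
# (document.to_dict(True) above includes the index metadata) to the bulk helper.
# It assumes self._logging and the logging module from the surrounding file:
from elasticsearch.helpers import bulk

def _geocomplete_index_batch(self, elasticsearch_conn, to_index):
    self._logging(logging.INFO,
                  'Indexing %s geoloc-entry documents.' % len(to_index))
    bulk(elasticsearch_conn, to_index)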
def _perform_index_purge(self, index_name, index_settings, doc_type_class):
log_msg = 'Dropping %s index.' % index_name
self._logging(logging.INFO, log_msg)
index = elasticsearch_dsl.Index(index_name)
index.settings(**index_settings)
index.doc_type(doc_type_class)
try:
index.delete(ignore=404)
index.create()
except elasticsearch.exceptions.ElasticsearchException as e:
log_msg = 'Error while dropping %s index: %s.' % (index_name, e)
self._logging(logging.ERROR, log_msg)
return
log_msg = 'Index %s has been dropped successfully.' % index_name
self._logging(logging.INFO, log_msg)
def index():
"""
URL : /index
Index a new URL in search engine.
Method : POST
Form data :
- url : the url to index [string, required]
Return a success message.
"""
# get POST data
data = dict((key, request.form.get(key)) for key in request.form.keys())
if "url" not in data :
raise InvalidUsage('No url specified in POST data')
# launch exploration job
index_job.delay(data["url"])
return "Indexing started"
def _create_index(self):
dt = datetime.utcnow()
dt = dt.strftime('%Y.%m')
es = connections.get_connection()
if not es.indices.exists('indicators-{}'.format(dt)):
index = Index('indicators-{}'.format(dt))
index.aliases(live={})
index.doc_type(Indicator)
index.create()
m = Mapping('indicator')
m.field('indicator_ipv4', 'ip')
m.field('indicator_ipv4_mask', 'integer')
m.save('indicators-{}'.format(dt))
return 'indicators-{}'.format(dt)
def mitm_request(self, data):
# Initialize ES connection and index
res = connections.create_connection(hosts=[args.elasticsearch])
idx = Index(args.index)
idx.doc_type(DocHTTPRequestResponse)
try:
DocHTTPRequestResponse.init()
idx.create()
except Exception:
# ignore failures such as the index already existing
pass
r = HTTPRequest(data)
# determine url
if self.is_connect:
scheme = "https"
else:
scheme = "http"
url = scheme + "://" + self.hostname
if scheme == "http" and int(self.port) != 80 or scheme == "https" and int(self.port) != 443:
url += ":" + str(self.port)
url += self.path
if args.verbose:
print(url)
self.doc = DocHTTPRequestResponse(host=self.hostname, port=int(self.port), protocol=scheme)
self.doc.meta.index = args.index
self.doc.request.url = url
self.doc.request.requestline = r.requestline
self.doc.request.method = r.command
self.doc.host = self.hostname
self.doc.port = int(self.port)
self.doc.protocol = scheme
return data
def createMenuItems(self, invocation):
menuItems = list()
selectedMsgs = invocation.getSelectedMessages()
if selectedMsgs is not None and len(selectedMsgs) >= 1:
menuItems.append(JMenuItem("Add to ElasticSearch Index", actionPerformed=self.genAddToES(selectedMsgs, invocation.getInputEvent().getComponent())))
return menuItems
def migrate():
hidden_services = Index('hiddenservices')
hidden_services.delete(ignore=404)
hidden_services = Index('hiddenservices')
hidden_services.doc_type(DomainDocType)
hidden_services.doc_type(PageDocType)
hidden_services.settings(number_of_shards=8, number_of_replicas=1)
hidden_services.create()
def tearDown(self):
index = Index(settings.ELASTICSEARCH_INDEX)
index.delete(ignore=404)
def _index_img(self, img):
"""Index a single img and ensure that it's been propagated to the search engine"""
image = search.db_image_to_index(img)
image.save()
index = Index(name=settings.ELASTICSEARCH_INDEX)
index.flush(force=True)
index.refresh()
def index_all_images(self, chunk_size=DEFAULT_CHUNK_SIZE, num_iterations=DEFAULT_NUM_ITERATIONS,
num_threads=DEFAULT_NUM_THREADS):
"""Index every record in the database with a server-side cursor"""
index = Index(settings.ELASTICSEARCH_INDEX)
if not index.exists():
log.info("Creating new index %s", settings.ELASTICSEARCH_INDEX)
search.Image.init()
mapping = search.Image._doc_type.mapping
mapping.save(settings.ELASTICSEARCH_INDEX)
log.info("Done creating new index")
with Pool(num_threads) as pool:
starts = [i * chunk_size for i in range(0, num_iterations)]
pool.starmap(do_index, zip(starts, itertools.repeat(chunk_size, len(starts))))
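# do_index is the worker handed to pool.starmap above and is defined elsewhere; a
# minimal sketch of what such a worker might do, assuming a Django model
# models.Image and the search.db_image_to_index helper used in _index_img
# (both come from the surrounding project and are not defined here):
def do_index(start, chunk_size):
    from elasticsearch.helpers import bulk
    from elasticsearch_dsl.connections import connections

    es = connections.get_connection()
    # models and search are modules of the surrounding project (assumption)
    qs = models.Image.objects.all()[start:start + chunk_size]
    actions = (search.db_image_to_index(img).to_dict(include_meta=True) for img in qs)
    bulk(es, actions)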
def test_aggregation_without_events(app, es_with_templates):
"""Check that the aggregation doesn't crash if there are no events.
This scenario happens when celery starts aggregating but no events
have been created yet.
"""
# Aggregate events
StatAggregator(event='file-download',
aggregation_field='file_id',
aggregation_interval='day',
query_modifiers=[]).run()
assert not Index(
'stats-file-download', using=current_search_client
).exists()
# Create the index but without any event. This happens when the events
# have been indexed but are not yet searchable (before index refresh).
Index('events-stats-file-download-2017',
using=current_search_client).create()
# Wait for the index to be available
time.sleep(1)
# Aggregate events
StatAggregator(event='file-download',
aggregation_field='file_id',
aggregation_interval='day',
query_modifiers=[]).run()
assert not Index(
'stats-file-download', using=current_search_client
).exists()
def get_bookmark(self):
"""Get last aggregation date."""
if not Index(self.aggregation_alias,
using=self.client).exists():
if not Index(self.event_index,
using=self.client).exists():
return datetime.date.today()
return self._get_oldest_event_timestamp()
# retrieve the oldest bookmark
query_bookmark = Search(
using=self.client,
index=self.aggregation_alias,
doc_type='{0}-bookmark'.format(self.event)
)[0:1].sort(
{'date': {'order': 'desc'}}
)
bookmarks = query_bookmark.execute()
# if no bookmark is found but the index exists, the bookmark was somehow
# lost or never written, so restart from the beginning
if len(bookmarks) == 0:
return self._get_oldest_event_timestamp()
# parse the stored bookmark date using the doc_id_suffix format
bookmark = datetime.datetime.strptime(bookmarks[0].date,
self.doc_id_suffix)
return bookmark
def run(self):
"""Calculate statistics aggregations."""
# If no events have been indexed there is nothing to aggregate
if not Index(self.event_index, using=self.client).exists():
return
lower_limit = self.get_bookmark()
# Stop here if no bookmark could be estimated.
if lower_limit is None:
return
upper_limit = min(
datetime.datetime.utcnow().
replace(microsecond=0),
datetime.datetime.combine(lower_limit +
datetime.timedelta(self.batch_size),
datetime.datetime.min.time())
)
while upper_limit <= datetime.datetime.utcnow():
self.indices = set()
self.new_bookmark = upper_limit.strftime(self.doc_id_suffix)
bulk(self.client,
self.agg_iter(lower_limit, upper_limit),
stats_only=True,
chunk_size=50)
# Flush all indices which have been modified
current_search_client.indices.flush(
index=','.join(self.indices),
wait_if_ongoing=True
)
self.set_bookmark()
self.indices = set()
lower_limit = lower_limit + datetime.timedelta(self.batch_size)
upper_limit = min(datetime.datetime.utcnow().
replace(microsecond=0),
lower_limit +
datetime.timedelta(self.batch_size))
if lower_limit > upper_limit:
break
def execute(self):
"""
Index data of specified queryset
"""
client = elasticsearch.Elasticsearch(
hosts=settings.ELASTIC_SEARCH_HOSTS,
# sniff_on_start=True,
retry_on_timeout=True,
refresh=True
)
start_time = time.time()
duration = time.time()
loop_time = elapsed = duration - start_time
for batch_i, total_batches, start, end, total, qs in self.batch_qs():
loop_start = time.time()
total_left = ((total_batches - batch_i) * loop_time)
progress_msg = \
'%s of %s : %8s %8s %8s duration: %.2f left: %.2f' % (
batch_i, total_batches, start, end, total, elapsed,
total_left
)
log.debug(progress_msg)
helpers.bulk(
client, (self.convert(obj).to_dict(include_meta=True)
for obj in qs),
raise_on_error=True,
refresh=True
)
now = time.time()
elapsed = now - start_time
loop_time = now - loop_start
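# batch_qs is a helper on this command class that yields
# (batch_number, total_batches, start, end, total, queryset_slice) tuples; a minimal
# standalone sketch over a Django-style queryset with a fixed batch size
# (hypothetical, shown only to make the loop above easier to follow):
def batch_qs(qs, batch_size=1000):
    total = qs.count()
    total_batches = (total + batch_size - 1) // batch_size
    for i in range(total_batches):
        start = i * batch_size
        end = min(start + batch_size, total)
        yield i + 1, total_batches, start, end, total, qs[start:end]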
def _perform_index_sync(self, sql_table_cls, es_doc_cls, id_logger):
es_doc = es_doc_cls()
elasticsearch_conn = connections.get_connection()
sync_timestamp = current_server_timestamp()
pending_insertions = self._compute_dirty_documents(
sql_table_cls, es_doc.doc_type)
bulk_op = self._synchronisation_op(es_doc, pending_insertions)
self._logging(logging.INFO, 'Performing synchronization.')
for ok, info in parallel_bulk(elasticsearch_conn, bulk_op):
obj_id = info['index']['_id'] \
if 'index' in info else info['update']['_id']
if ok:
# Mark the task as handled so we don't re-process it next time
self._logging(logging.INFO,
'Document %s has been synced successfully.'
% obj_id)
sql_table_cls.update_last_sync(obj_id, sync_timestamp)
else:
id_logger(obj_id, logging.ERROR,
'Error while syncing document %s index.' % obj_id)
# Refresh indices to increase research speed
elasticsearch_dsl.Index(es_doc.index).refresh()
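# _synchronisation_op builds the action stream consumed by parallel_bulk above; a
# minimal sketch that turns the pending rows into plain bulk index actions
# (the row attributes id and to_document_dict() are hypothetical):
def _synchronisation_op(self, es_doc, pending_insertions):
    for row in pending_insertions:
        yield {
            '_op_type': 'index',
            '_index': es_doc.index,
            '_type': es_doc.doc_type,
            '_id': row.id,
            '_source': row.to_document_dict(),
        }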
def create_indices(endpoint):
"""
Creates the 'pic' index with Constituent and Address doc types
"""
connections.connections.create_connection(hosts=[endpoint], timeout=360, max_retries=10, retry_on_timeout=True)
pic_index = Index('pic')
pic_index.doc_type(Constituent)
pic_index.doc_type(Address)
pic_index.delete(ignore=404)
pic_index.settings(
number_of_shards=5,
number_of_replicas=2
)
pic_index.create()
def __init__(self, config_file='config.cfg'):
super(Elastic, self).__init__()
self.percentage = 10.0
self.minimum_occurrences = 250
# The ConfigParser documentation points out that there is no way to force
# default values for options outside the "DEFAULT" section.
config = ConfigParser()
config.read(config_file)
if not config.has_section('elastic'):
config.add_section('elastic')
for option, value in {'use_ssl': 'True', 'host': '127.0.0.1', 'version': '2', 'index': 'nxapi', 'doc_type': 'events'}.items():
if not config.has_option('elastic', option):
config.set('elastic', option, value)
self.version = config.getint('elastic', 'version')
self.index = config.get('elastic', 'index')
use_ssl = config.getboolean('elastic', 'use_ssl')
host = config.get('elastic', 'host')
self.doc_type = config.get('elastic', 'doc_type')
self.client = connections.create_connection(hosts=[host], use_ssl=use_ssl, index=self.index, version=self.version, doc_type=self.doc_type, timeout=30, retry_on_timeout=True )
Event.init(index=self.index)
index = Index(self.index, using=self.client)
index.doc_type(Event)
self.initialize_search()
def handle(self, *args, **options):
if options['verbose']:
log.setLevel(logging.DEBUG)
es = search.init_es(timeout=2000)
oldindex = Index(options['oldindex'])
client = elasticsearch.client.IndicesClient(es)
# Create the new index
newindex = Index(options['newindex'])
newindex.doc_type(search.Image)
try:
newindex.create()
except elasticsearch.exceptions.RequestError as e:
if options['force']:
log.warn("Trying to delete previously-created new index %s", options['newindex'])
newindex.delete()
newindex.create()
else:
raise e
log.info("Done creating new index %s", options['newindex'])
log.info("Copying data on %s to %s", options['oldindex'], options['newindex'])
# Would love to use ES native reindex() but AWS's service doesn't support it :(
elasticsearch.helpers.reindex(es, options['oldindex'], options['newindex'])
# Wait for it to be happy
if not settings.DEBUG:
es.cluster.health(wait_for_status='green', request_timeout=2000)
# Is the value of 'oldindex' an alias or a real index?
if client.exists_alias(name=settings.ELASTICSEARCH_INDEX):
log.info("Confirmed that value of %s is an alias and not a real index" % options['oldindex'])
alias_move = """{
"actions" : [
{ "remove" : { "index" : "%s", "alias" : "%s" } },
{ "add" : { "index" : "%s", "alias" : "%s" } }
]
}""" % (options['oldindex'], settings.ELASTICSEARCH_INDEX, options['newindex'], settings.ELASTICSEARCH_INDEX)
client.update_aliases(alias_move)
elif client.exists(options['oldindex']):
log.info("%s is a real index and not an alias, fixing" % options['oldindex'])
# Delete the old index
log.info("Deleting %s -- this will cause some downtime", options['oldindex'])
oldindex.delete()
client.put_alias(options['newindex'], settings.ELASTICSEARCH_INDEX)
# Confirm number of documents in current settings
s = Search()
response = s.execute()
log.info("%d results available in %s" % (response.hits.total, settings.ELASTICSEARCH_INDEX))