def __init__(self, lte=None, gte=None, limit=250, sort='date_disseminated,ASC', fastout=False, verify=True, endpoint='http://127.0.0.1/', start_offset=0):
if gte and not lte:
lte = datetime.now().isoformat()
if lte and not gte:
gte = '2000-01-01'
self.lte = lte
self.gte = gte
self.limit = limit
self.sort = sort
self.fastout = fastout
self.verify = verify
self.endpoint = endpoint
self.fcc_endpoint = 'https://ecfsapi.fcc.gov/filings'
self.index_fields = mappings.FIELDS.keys()
self.es = Elasticsearch(self.endpoint, timeout=30)
self.start_offset = start_offset
self.stats = {'indexed': start_offset, 'fetched': start_offset}
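# A minimal standalone sketch of the date-range defaulting above: supplying only
# one bound fills in the other ('2000-01-01' as the open start, now() as the open end).
from datetime import datetime
gte, lte = '2017-01-01', None
if gte and not lte:
    lte = datetime.now().isoformat()
if lte and not gte:
    gte = '2000-01-01'
# gte == '2017-01-01', lte == e.g. '2017-06-02T12:00:00.000000'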
def __init__(self, settings):
kwargs = settings.get('es').get('client')
es_user = settings.get('es_user')
es_pass = settings.get('es_pass')
if es_user and es_pass:
kwargs.update(**dict(http_auth=(es_user, es_pass)))
self.client = Elasticsearch(**kwargs)
self.timeout = settings.get('es').get('client').get('timeout')
self.doc_type = settings.get('es').get('doc_type')
self.index_name = settings.get('es').get('index')
self.id_field = settings.get('id_field')
self.bulk_size = settings.get('bulk_size', 1000)
self.path_encoding = settings.get('path_encoding')
self.actions = []
log.debug('ESStorer instance created: %s', self.client)
def run(args):
elasticsearchServer = args[0] if len(args) else 'localhost:9200'
indexName = 'nhs_conditions'
docType = 'condition'
es = Elasticsearch(elasticsearchServer)
es.indices.delete(index=indexName, ignore=[400,404])
f = open('nhsPageContent','w')
f.write('[')
for model in get_pages_info_models('http://www.nhs.uk/Conditions/Pages/hub.aspx'):
json = model.to_json()
es.index(index=indexName, doc_type=docType, body=json)
f.write(json + ",\n")
f.write(']')
f.close()
es.indices.refresh(index=indexName)
def get_summary_statistics():
"""
Obtains summary statistics: the current sums of flows, packets and bytes.
:return: JSON string with status "Ok" or "Error" and the requested data.
"""
try:
# Elastic query
client = elasticsearch.Elasticsearch([{'host': myconf.get('consumer.hostname'), 'port': myconf.get('consumer.port')}])
elastic_bool = []
elastic_bool.append({'range': {'@timestamp': {'gte': "now-5m", 'lte': "now"}}})
elastic_bool.append({'term': {'@type': 'protocols_statistics'}})
qx = Q({'bool': {'must': elastic_bool}})
s = Search(using=client, index='_all').query(qx)
s.aggs.bucket('sum_of_flows', 'sum', field='flows')
s.aggs.bucket('sum_of_packets', 'sum', field='packets')
s.aggs.bucket('sum_of_bytes', 'sum', field='bytes')
s = s.sort('@timestamp')
result = s.execute()
# Parse the result into CSV in the format: timestamp, flows, packets, bytes
data = "Timestamp, Flows, Packets, Bytes;"
timestamp = "Last 5 Minutes"
data += timestamp + ', ' +\
str(int(result.aggregations.sum_of_flows['value'])) + ', ' +\
str(int(result.aggregations.sum_of_packets['value'])) + ', ' +\
str(int(result.aggregations.sum_of_bytes['value']))
json_response = '{"status": "Ok", "data": "' + data + '"}'
return json_response
except Exception as e:
json_response = '{"status": "Error", "data": "Elasticsearch query exception: ' + escape(str(e)) + '"}'
return json_response
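# A hedged usage sketch of get_summary_statistics(): the returned string is JSON
# whose "data" field is a one-row CSV ("Timestamp, Flows, Packets, Bytes;<row>").
import json
response = json.loads(get_summary_statistics())
if response['status'] == 'Ok':
    header, row = response['data'].split(';', 1)
    print(header, row)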
def setup_es(es_ip, es_port):
"""
Setup an Elasticsearch connection
Parameters
----------
es_ip: string
IP address for elasticsearch instance
es_port: string
Port for elasticsearch instance
Returns
-------
S: an elasticsearch_dsl Search object bound to the "geonames" index.
"""
CLIENT = Elasticsearch([{'host' : es_ip, 'port' : es_port}])
S = Search(using=CLIENT, index="geonames")
return S
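# A hedged usage sketch: setup_es() returns an elasticsearch_dsl Search bound to
# the "geonames" index; the field name queried below is an assumption.
s = setup_es('127.0.0.1', '9200')
response = s.query('match', name='Berlin')[:5].execute()
for hit in response:
    print(hit.meta.id, hit.meta.score)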
def ESConnection():
parser = configparser.ConfigParser()
conf_file = 'settings'
fd = open(conf_file, 'r')
parser.readfp(fd)
fd.close()
sections = parser.sections()
for section in sections:
options = parser.options(section)
for option in options:
if option == 'user': user = parser.get(section, option)
if option == 'password': password = parser.get(section, option)
if option == 'host': host = parser.get(section, option)
if option == 'port': port = parser.get(section, option)
if option == 'path': path = parser.get(section, option)
#if option == 'genderize_key': key = parser.get(section, option)
connection = "https://" + user + ":" + password + "@" + host + ":" + port + "/" + path
es_write = Elasticsearch([connection], verify_certs=False)
#es_write = Elasticsearch(["127.0.0.1:9200"])
return es_write
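# A minimal sketch of the INI-style 'settings' file ESConnection() expects
# (the section name and values are assumptions; any single section works since
# the function scans every section for these options).
with open('settings', 'w') as fd:
    fd.write("[elasticsearch]\n"
             "user = readonly\n"
             "password = secret\n"
             "host = es.example.org\n"
             "port = 443\n"
             "path = data\n")
es_write = ESConnection()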
def __init__(self, start, end, telescopes=None, sites=None, instrument_types=None):
try:
self.es = Elasticsearch([settings.ELASTICSEARCH_URL])
except LocationValueError:
logger.error('Could not find host. Make sure ELASTICSEARCH_URL is set.')
raise ImproperlyConfigured('ELASTICSEARCH_URL')
self.instrument_types = instrument_types
self.available_telescopes = self._get_available_telescopes()
sites = list({tk.site for tk in self.available_telescopes}) if not sites else sites
telescopes = list({tk.telescope for tk in self.available_telescopes if tk.site in sites}) \
if not telescopes else telescopes
self.start = start.replace(tzinfo=timezone.utc).replace(microsecond=0)
self.end = end.replace(tzinfo=timezone.utc).replace(microsecond=0)
cached_event_data = cache.get('tel_event_data')
if cached_event_data:
self.event_data = cached_event_data
else:
self.event_data = self._get_es_data(sites, telescopes)
cache.set('tel_event_data', self.event_data, 1800)
def __init__(self, host='localhost', port='9200', protocol='http', path=None, user=None, password=None):
""" Class constructor
:param host: ElasticSearch host domain
:param port: ElasticSearch connection port
:param protocol: ElasticSearch protocol (typically http or https)
:param path: ElasticSearch connection path (URL prefix)
:param user: ElasticSearch connection user
:param password: ElasticSearch connection password
"""
credentials = ""
if user is not None and password is not None:
credentials = user + ":" + password + "@"
if path is None:
path = ""
connection = protocol + "://" + credentials + host + ":" + port + path
print(connection)
self.es = Elasticsearch([connection])
def __init__(self, host, test_id, debug=False):
"""
:param host: the elasticsearch host
:param test_id: id of the test to which we are restricting the queries
"""
self._es = Elasticsearch(host)
self._id = test_id
self.debug = debug
self.functiondict = {
'no_proxy_errors' : self.check_no_proxy_errors,
'bounded_response_time' : self.check_bounded_response_time,
'http_success_status' : self.check_http_success_status,
'http_status' : self.check_http_status,
# 'reachability' : self.check_reachability,
'bounded_retries' : self.check_bounded_retries,
'circuit_breaker' : self.check_circuit_breaker,
'at_most_requests': self.check_at_most_requests
}
def get(doc_type, doc_id, fields=True):
"""
Get an Elasticsearch document.
:param basestring doc_type: document type
:param doc_id: document id, will be converted into basestring
:param fields: if ``False``, returns whether the document is found as bool;
if ``True``, returns the document dict; if list of string, returns the
document dict with only the specified fields.
:rtype: dict or bool
"""
ret = es.get(
index='oclubs',
doc_type=doc_type,
id=doc_id,
_source=fields
)
if fields is not False:
return ret['_source']
else:
return ret['found']
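# A hedged usage sketch of get(): doc types and ids below are illustrative only.
club = get('club', 42)                        # full _source dict
name_only = get('club', 42, fields=['name'])  # _source limited to 'name'
exists = get('club', 42, fields=False)        # bool from the 'found' flag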
def use_store(self):
"""
Opens a database to save data
"""
logging.info('Using Elasticsearch database')
self.db = Elasticsearch(
[self.settings.get('host', 'localhost:9200')],
)
try:
self.db.indices.create(index='mcp-watch', ignore=400) # may exist
except ConnectionError as feedback:
logging.error('- unable to connect')
raise
return self.db
def reset_store(self):
"""
Opens a database for points
"""
logging.info('Resetting Elasticsearch database')
self.db = Elasticsearch(
[self.settings.get('host', 'localhost:9200')],
)
try:
self.db.indices.create(index='mcp-watch', ignore=400) # may exist
except ConnectionError as feedback:
logging.error('- unable to connect')
raise
return self.db
def __init__(self, config, workload, tool="browbeat", cache_size=1000, max_cache_time=10):
self.config = config
self.cache = deque()
self.max_cache_size = cache_size
self.last_upload = datetime.datetime.utcnow()
self.max_cache_age = datetime.timedelta(minutes=max_cache_time)
self.logger = logging.getLogger('browbeat.elastic')
self.es = elasticsearch.Elasticsearch([
{'host': self.config['elasticsearch']['host'],
'port': self.config['elasticsearch']['port']}],
send_get_body_as='POST'
)
self.workload = workload
today = datetime.datetime.today()
self.index = "{}-{}-{}".format(tool,
workload, today.strftime('%Y.%m.%d'))
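# A minimal sketch of the index-naming scheme above: one index per tool, workload
# and day (the workload name is an assumption).
import datetime
index = "{}-{}-{}".format("browbeat", "rally",
                          datetime.datetime.today().strftime('%Y.%m.%d'))
# e.g. 'browbeat-rally-2017.06.02'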
def get_elasticsearch(check_availability=False):
"""Return Elasticsearch instance.
:param check_availability: check if nodes are available
:returns: an Elasticsearch client; re-raises the underlying exception on failure
:rtype: elasticsearch.Elasticsearch
"""
nodes = config.get_config()["backend"]["connection"]
try:
es = elasticsearch.Elasticsearch(nodes)
if check_availability:
es.info()
except Exception as e:
LOG.warning(
"Failed to query Elasticsearch nodes %s: %s"
% (nodes, str(e)))
raise
return es
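# A hedged usage sketch: ping the configured nodes up front so a misconfigured
# backend fails fast instead of on the first real query.
es = get_elasticsearch(check_availability=True)
print(es.info()['cluster_name'])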
def init_es(timeout=TIMEOUT):
log.info("connecting to %s %s", settings.ELASTICSEARCH_URL, settings.ELASTICSEARCH_PORT)
auth = AWSRequestsAuth(aws_access_key=settings.AWS_ACCESS_KEY_ID,
aws_secret_access_key=settings.AWS_SECRET_ACCESS_KEY,
aws_host=settings.ELASTICSEARCH_URL,
aws_region='us-west-1',
aws_service='es')
auth.encode = lambda x: bytes(x.encode('utf-8'))
es = Elasticsearch(host=settings.ELASTICSEARCH_URL,
port=settings.ELASTICSEARCH_PORT,
connection_class=RequestsHttpConnection,
timeout=timeout,
max_retries=10, retry_on_timeout=True,
http_auth=auth)
return es
def setUp(self):
# Clean index
self.es = Elasticsearch(hosts=[LOCAL_ELASTICSEARCH])
try:
self.es.indices.delete(index='datahub')
self.es.indices.delete(index='events')
except NotFoundError:
pass
self.es.indices.create('datahub')
mapping = {'dataset': {'properties': self.MAPPING}}
self.es.indices.put_mapping(doc_type='dataset',
index='datahub',
body=mapping)
self.es.indices.create('events')
mapping = {'event': {'properties': {'timestamp': {'type': 'date'}}}}
self.es.indices.put_mapping(doc_type='event',
index='events',
body=mapping)
def setup():
try:
from elasticsearch import Elasticsearch, ElasticsearchException
except ImportError:
raise unittest.SkipTest("elasticsearch-py not installed.")
es = Elasticsearch(settings.HAYSTACK_CONNECTIONS['elasticsearch']['URL'])
try:
es.info()
except ElasticsearchException as e:
raise unittest.SkipTest("elasticsearch not running on %r" % settings.HAYSTACK_CONNECTIONS['elasticsearch']['URL'], e)
global test_runner
global old_config
from django.test.runner import DiscoverRunner
test_runner = DiscoverRunner()
test_runner.setup_test_environment()
old_config = test_runner.setup_databases()
def __create_connection(self, config):
kwargs = {
'host': config['HOST'],
'port': config.get('PORT', 9200),
'use_ssl': config.get('USE_SSL', False),
'verify_certs': True,
'ca_certs': certifi.where()
}
if 'AWS_ACCESS_KEY' in config and \
'AWS_SECRET_KEY' in config and \
'AWS_REGION' in config:
kwargs['connection_class'] = RequestsHttpConnection
kwargs['http_auth'] = AWSRequestsAuth(
aws_access_key=config['AWS_ACCESS_KEY'],
aws_secret_access_key=config['AWS_SECRET_KEY'],
aws_host=config['HOST'],
aws_region=config['AWS_REGION'],
aws_service='es')
es = Elasticsearch(**kwargs)
es._index = config['INDEX_NAME']
es._settings = config.get('INDEX_SETTINGS', DEFAULT_INDEX_SETTINGS)
return es
def destroy(self):
"""
Deletes all elasticsearch history for Alphabay. It will delete all
indexes matching:
dminer-alphabay-*
It will also delete the template for indexes. This template is named:
dminer-alphabay-template
"""
es = Elasticsearch([":".join([str(self.host), str(self.port)])])
self.logger.info("Deleting index: dminer-alphabay-*")
es.indices.delete("dminer-alphabay-*")
self.logger.info("Deleting index template: dminer-dreammarket-template")
es.indices.delete_template("dminer-alphabay-template")
def destroy(self):
"""
Deletes all elasticsearch history for Hansa. It will delete all
indexes matching:
dminer-hansa-*
It will also delete the template for indexes. This template is named:
dminer-hansa-template
"""
es = Elasticsearch([":".join([str(self.host), str(self.port)])])
self.logger.info("Deleting index: dminer-hansa-*")
es.indices.delete("dminer-hansa-*")
self.logger.info("Deleting index template: dminer-dreammarket-template")
es.indices.delete_template("dminer-hansa-template")
def destroy(self):
"""
Deletes all elasticsearch history for DreamMarket. It will delete all
indexes matching:
dminer-dreammarket-*
It will also delete the template for indexes. This template is named:
dminer-dreammarket-template
"""
es = Elasticsearch([":".join([str(self.host), str(self.port)])])
self.logger.info("Deleting index: dminer-dreammarket-*")
es.indices.delete("dminer-dreammarket-*")
self.logger.info("Deleting index template: dminer-dreammarket-template")
es.indices.delete_template("dminer-dreammarket-template")
def getKibiRelationConfig(indexName=".kibi", typeName="config" , elasticPort=9220, elasticHost="localhost"):
es = Elasticsearch([{'host': elasticHost, 'port': elasticPort}], http_auth=(elasticUsername, elasticPassword))
mapping = es.search(
index=indexName,
doc_type=typeName,
size=1000,
request_timeout=1060,
body={
'query': {
'filtered': {
'query': {
'match_all': {}
}
}
}
}
)
return mapping['hits']['hits'][0]
#return mapping[SourceIndexName]["mappings"][SourceTypeName]
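# A hedged usage sketch: the function returns the first raw hit from the .kibi
# index, so the stored config lives under its '_source' key (the keys inside the
# config document are assumptions).
hit = getKibiRelationConfig()
kibi_config = hit['_source']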
def lambda_handler(event, context):
es = Elasticsearch(os.environ['ELASTICSEARCH_URL'])
indices = es.indices.get('*')
for record in event['Records']:
# Kinesis data is base64 encoded so decode here
payload = base64.b64decode(record['kinesis']['data'])
create_object_payload = json.loads(payload)
doc_type_name = create_object_payload['event_type']
index_name = create_object_payload['event_payload']['index_name']
payload_data = create_object_payload['event_payload']['data']
index_terms = index_name.split('_')
del index_terms[-1]
index_prefix = '_'.join(index_terms)
latest_index_name = max(filter(lambda k: index_prefix in k, indices))
if latest_index_name != index_name:
index_name = latest_index_name
res = es.index(index=index_name, doc_type=doc_type_name, id=str(create_object_payload['object_id']), body=payload_data)
for conn in es.transport.connection_pool.connections:
conn.pool.close()
return 'Successfully processed {} records.'.format(len(event['Records']))
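# A minimal sketch of the index-rollover logic above: drop the last '_'-separated
# token to get the prefix, then switch to the lexicographically latest index that
# matches it (index names below are assumptions).
indices = ['orders_20170401', 'orders_20170501', 'orders_20170601']
index_name = 'orders_20170401'
index_prefix = '_'.join(index_name.split('_')[:-1])               # 'orders'
latest_index_name = max(k for k in indices if index_prefix in k)  # 'orders_20170601'
if latest_index_name != index_name:
    index_name = latest_index_name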
def lambda_handler(event, context):
es = Elasticsearch(os.environ['ELASTICSEARCH_URL'])
indices_list = es.indices.get('*')
for record in event['Records']:
# Kinesis data is base64 encoded so decode here
payload = base64.b64decode(record['kinesis']['data'])
update_object_payload = json.loads(payload)
doc_type_name = update_object_payload['event_type']
index_name = update_object_payload['event_payload']['index_name']
payload_data = update_object_payload['event_payload']['data']
index_terms = index_name.split('_')
del index_terms[-1]
index_prefix = '_'.join(index_terms)
latest_index_name = max(filter(lambda k: index_prefix in k, indices_list))
if latest_index_name != index_name:
index_name = latest_index_name
res = es.update(index=index_name, doc_type=doc_type_name, id=str(update_object_payload['object_id']), body={'doc': payload_data, 'doc_as_upsert':True})
for conn in es.transport.connection_pool.connections:
conn.pool.close()
return 'Successfully processed {} records.'.format(len(event['Records']))
def elasticsearch(process_fixture_name):
"""
Create Elasticsearch client fixture.
:param str process_fixture_name: elasticsearch process fixture name
"""
@pytest.fixture
def elasticsearch_fixture(request):
"""Elasticsearch client fixture."""
process = request.getfixturevalue(process_fixture_name)
if not process.running():
process.start()
hosts = '{0!s}:{1!s}'.format(process.host, process.port)
client = Elasticsearch(hosts=hosts)
def drop_indexes():
client.indices.delete(index='*')
request.addfinalizer(drop_indexes)
return client
return elasticsearch_fixture
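# A hedged usage sketch: pair the client factory above with a running-process
# fixture, e.g. from pytest-elasticsearch (the fixture name is an assumption).
elasticsearch_client = elasticsearch('elasticsearch_proc')

def test_can_index(elasticsearch_client):
    elasticsearch_client.index(index='demo', doc_type='doc', body={'ok': True})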
def test_es():
"""
Before running other tests, ensure connection to ES is established
"""
es = Elasticsearch()
try:
es.indices.create(INDEX)
es.indices.delete(INDEX)
return True
except RequestError:
print('Index already exists: skipping tests.')
return False
except ConnectionError:
print('The ElasticSearch backend is not running: skipping tests.')
return False
except Exception as e:
print('An unknown error occurred while connecting to ElasticSearch: %s' % e)
return False
def clustering_pubs_page(request):
if request.GET.get('index_name') is not None and request.GET.get('index_name') != '':
dci = DocClusteringInfo()
dci.doc_type = "publication"
dci.cost = -1
dci.iter = -1
dci.k = -1
dci.index_name = request.GET.get('index_name')
dci.save()
if request.GET.get('k') is not None:
clustering_pubs.tasks.cluster_index.delay(dci.index_name, k=int(request.GET.get('k')))
else:
clustering_pubs.tasks.cluster_index.delay(dci.index_name)
return redirect("/clustering_pubs/status/%s/" % dci.index_name)
es = Elasticsearch()
indexes = es.indices.get_mapping()
return render(request, 'clustering_pubs.html', {'indexes': indexes})
def retrieve_dataset(index_name, doc_type, weight={'title': 5, 'abstract': 1}):
es = Elasticsearch()
results = es.search(index=index_name, doc_type=doc_type, size=10000)['hits']['hits']
dataset = {}
for res in results:
doc = DocumentInfo(res['_id'])
term_vectors = es.termvectors(index=index_name, doc_type=doc_type, id=res['_id'], offsets=False,
payloads=False, positions=False, fields='title,abstract',
field_statistics=False)['term_vectors']
for zone in {'abstract', 'title'}:
term_vector = term_vectors[zone]['terms']
for term in term_vector:
stemmed = stem(term)
if stemmed.isalpha():
if stemmed not in doc.tf:
doc.tf[stemmed] = term_vector[term]['term_freq'] * weight[zone]
else:
doc.tf[stemmed] += term_vector[term]['term_freq'] * weight[zone]
dataset[res['_id']] = doc
return dataset
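# A worked example of the zone weighting above: with weight={'title': 5, 'abstract': 1},
# a stem seen twice in the title and three times in the abstract accumulates
# tf = 2*5 + 3*1 = 13.
weight = {'title': 5, 'abstract': 1}
tf = 2 * weight['title'] + 3 * weight['abstract']  # 13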
def indexing_status_page(request, id):
es = Elasticsearch()
crawl_info = CrawlInfo.objects.get(id=id)
try:
es.indices.refresh(index="index-%d" % crawl_info.id)
percentage = int(es.count("index-%d" % crawl_info.id, crawl_info.type).get('count') * 100 /
crawl_info.successful_crawls)
percentage = max(1, percentage)
except Exception as e:
percentage = 0
if request.GET.get('type', 'HTML') == 'JSON':
result = json.dumps({'status': 'OK', 'percent': percentage},
ensure_ascii=False, encoding='utf8')
return HttpResponse(result, content_type='application/json; charset=utf-8')
return render(request, 'indexing_status.html', {'percent': percentage})
def get_elasticsearch_object():
""" Creating an elasticsearch object to query the index """
try:
es_servers = settings.ELASTICSEARCH_SERVERS
es_servers = es_servers if isinstance(es_servers, list) \
else [es_servers]
except AttributeError:
es_servers = ["http://localhost:9200"]#["https://ahmia.fi/esconnection/"]
try:
timeout = settings.ELASTICSEARCH_TIMEOUT
except AttributeError:
timeout = 60
es_obj = Elasticsearch(hosts=es_servers,
timeout=timeout)
return es_obj