def _scan_fingerprints(dataset_name=None):
    """Yield ``(fingerprint, dataset)`` pairs for entities in the index.

    Only the ``fingerprints`` and ``dataset`` fields of each document are
    requested. Entities without fingerprints, and ``None`` entries inside a
    fingerprint list, are skipped.

    :param dataset_name: when truthy, restrict the scan to documents whose
        ``dataset`` field matches this value (``term`` query); otherwise
        every document is scanned (``match_all``).
    :returns: a generator of ``(fingerprint, dataset)`` tuples, where
        ``dataset`` is the document's own ``dataset`` field (or ``None``
        if the field is absent).
    """
    if dataset_name:
        query_filter = {'term': {'dataset': dataset_name}}
    else:
        query_filter = {'match_all': {}}
    body = {
        'query': query_filter,
        '_source': ['fingerprints', 'dataset'],
    }
    # NOTE(review): `scan` is presumably elasticsearch.helpers.scan, which
    # streams hits lazily -- confirm against the module imports.
    scan_iter = scan(es, query=body, index=es_index, doc_type=Schema.ENTITY)
    for i, doc in enumerate(scan_iter):
        # Tolerate hits that carry no `_source` payload instead of failing
        # with AttributeError on None.
        source = doc.get('_source') or {}
        fps = source.get('fingerprints')
        if fps is not None:
            dataset = source.get('dataset')
            for fp in fps:
                if fp is not None:
                    yield fp, dataset
        # Progress is reported for every 10000th entity, including ones
        # without fingerprints: no `continue` above may bypass this check.
        if i != 0 and i % 10000 == 0:
            log.info("Crossref: %s entities...", i)
评论列表
文章目录