def retrieve_dataset(index_name, doc_type, weight=None):
    """Build a mapping of document id to weighted term-frequency info.

    Runs a search against ``index_name``/``doc_type``, then requests the
    term vectors of the ``title`` and ``abstract`` fields for every hit.
    Each term is passed through ``stem``; stems that are purely alphabetic
    are accumulated into ``doc.tf`` as ``term_freq * weight[zone]``.

    Args:
        index_name: Name of the Elasticsearch index to read.
        doc_type: Document type passed to the search and termvectors calls.
        weight: Mapping of zone name (``'title'``, ``'abstract'``) to the
            multiplier applied to term frequencies from that zone.
            Defaults to ``{'title': 5, 'abstract': 1}``.

    Returns:
        dict mapping each hit's ``_id`` to its ``DocumentInfo`` object.

    Raises:
        KeyError: If ``weight`` has no entry for a zone that is present in
            a document's term vectors.
    """
    # Build the default per call so no dict instance is shared between calls.
    if weight is None:
        weight = {'title': 5, 'abstract': 1}

    es = Elasticsearch()
    # NOTE: a single search page is read, so at most 10000 hits are processed.
    results = es.search(index=index_name, doc_type=doc_type, size=10000)['hits']['hits']

    dataset = {}
    for res in results:
        doc_id = res['_id']
        doc = DocumentInfo(doc_id)
        response = es.termvectors(index=index_name, doc_type=doc_type, id=doc_id, offsets=False,
                                  payloads=False, positions=False, fields='title,abstract',
                                  field_statistics=False)
        # A document may come back without term vectors at all.
        term_vectors = response.get('term_vectors', {})

        # Fixed tuple (not a set) keeps the insertion order into doc.tf stable.
        for zone in ('title', 'abstract'):
            zone_vector = term_vectors.get(zone)
            if not zone_vector:
                # Field missing or empty for this document: nothing to count.
                continue

            zone_weight = weight[zone]
            for term, stats in zone_vector['terms'].items():
                stemmed = stem(term)
                if not stemmed.isalpha():
                    continue
                weighted = stats['term_freq'] * zone_weight
                # Several surface terms can share one stem, so accumulate.
                if stemmed in doc.tf:
                    doc.tf[stemmed] += weighted
                else:
                    doc.tf[stemmed] = weighted

        dataset[doc_id] = doc
    return dataset
评论列表
文章目录