def tag_by_phrase(self, tag_query, source):
    """Tag up to ``self.limit`` documents matching *tag_query* with *source*.

    Counts the matches in the ``fcc-comments`` index, scans the matching
    documents, builds one update action per hit with
    ``lib.bulk_update_doc(doc_id, {'source': source})`` and submits all of
    them in a single ``lib.bulk_update`` call. Progress is printed to stdout.

    Args:
        tag_query: Elasticsearch query body; passed unchanged to both
            ``self.es.search`` (as ``body``) and ``scan`` (as ``query``).
        source: Value stored under the ``source`` key of every update.

    Returns:
        Whatever ``lib.bulk_update`` returns -- presumably the number of
        documents that were updated (TODO confirm against ``lib``).
    """
    print('query=%s source=%s' % (json.dumps(tag_query), source))
    resp = self.es.search(index='fcc-comments', body=tag_query, size=0)
    total = resp['hits']['total']
    if isinstance(total, dict):
        # Elasticsearch 7+ reports hits.total as {'value': N, 'relation': ...}
        # instead of a bare integer; keep only the count for the log lines.
        total = total.get('value', total)
    print('tagging %s / %s matches' % (self.limit, total))

    docs = []
    if self.limit > 0:
        hits = scan(self.es, index='fcc-comments', query=tag_query, size=1000)
    else:
        # The limit is only checked after a hit has been queued, so without
        # this guard a limit of 0 (or less) would still tag one document.
        hits = ()
    for doc in hits:
        docs.append(lib.bulk_update_doc(doc['_id'], {'source': source}))
        if not len(docs) % 1000:
            # Progress output only: tolerate hits that carry no text_data so
            # that a logging line cannot abort the tagging run.
            text = (doc.get('_source') or {}).get('text_data') or ''
            print('\tfetched %s\n%s\t%s' % (len(docs), doc['_id'], text[:400]))
        if len(docs) >= self.limit:
            break

    print('indexing %s' % (len(docs)))
    tagged = lib.bulk_update(self.es, docs)
    print('tagged %s / %s matches' % (tagged, total))
    return tagged
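# Non-code residue from the hosting web page (navigation labels), kept as a comment:
# "评论列表" (comment list), "文章目录" (table of contents)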