def run(self):
    '''
    Queue the ids of untagged documents that match the significant-terms query.

    get documents without a sentiment tag that match significant terms:
    - significant terms from positive regex tagged vs others
    - extra multi match clause for stronger terms (in multiple term sets:
      positive vs negative, untagged, and all)
    - phrase match net neutrality since both terms score high

    Each hit's ``_id`` is put on a multiprocessing queue that is consumed by
    ``self.bulk_index`` running in a separate process. Fetching stops when
    ``self.limit`` ids have been queued, when a search returns no hits, or
    when the search raises ``ConnectionTimeout`` (reported on stdout).

    Whatever happens in the fetch loop, a final ``None`` is put on the queue
    (presumably the stop signal for ``bulk_index`` -- confirm against that
    method) and the worker process is joined. Exceptions other than
    ``ConnectionTimeout`` still propagate to the caller after that cleanup.
    '''
    index_queue = multiprocessing.Queue()
    bulk_index_process = multiprocessing.Process(
        target=self.bulk_index, args=(index_queue,),
    )
    bulk_index_process.start()
    fetched = 0
    try:
        while fetched < self.limit:
            # use search instead of scan because keeping an ordered scan
            # cursor open negates the performance benefits
            resp = self.es.search(
                index='fcc-comments',
                body=self.query,
                # only ask for what is still missing so the total number of
                # queued ids never exceeds self.limit
                size=self.limit - fetched,
            )
            hits = resp['hits']['hits']
            if not hits:
                # no matching documents left: without this guard the loop
                # would re-issue the same empty search forever
                break
            for doc in hits:
                index_queue.put(doc['_id'])
                fetched += 1
                # progress line every 100 queued documents
                if not fetched % 100:
                    print('%s\t%s\t%s' % (fetched, doc['_score'],
                                          doc['_source']['text_data']))
    except ConnectionTimeout:
        print('error fetching: connection timeout')
    finally:
        # always release the worker, even on an unexpected error, so the
        # child process is not left blocked on the queue
        index_queue.put(None)
        bulk_index_process.join()
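# (web page navigation residue, not code) Comment list
# (web page navigation residue, not code) Table of contents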