def run_query(src, fn, rows=200, batch_size=100, max_batches=10):
    """Export comments for one source to a tab-separated file, one per e-mail.

    Pages through the ``fcc-comments`` index with the module-level ``es``
    client and ``query`` template, keeping only the first comment seen for
    each ``contact_email``, until ``rows`` distinct e-mails have been
    written, the result set is exhausted, or ``max_batches`` pages have
    been fetched.

    Args:
        src: Value placed in the query's ``analysis.source`` term filter.
        fn: Path of the output file; it is overwritten.
        rows: Maximum number of distinct e-mail rows to write.
        batch_size: Number of hits requested per search call.
        max_batches: Upper bound on the number of search calls.

    Returns:
        None. Output is the file at ``fn`` (header row: email, name, date,
        comment, url) plus progress messages on stdout.
    """
    print('writing data for %s to %s' % (src, fn))
    # NOTE: this edits the shared module-level ``query`` in place, so the
    # source filter stays set to ``src`` after the call returns.
    query['query']['function_score']['query']['bool']['must'][0]['term']['analysis.source'] = src
    print(json.dumps(query))

    emails = set()
    total = None
    with open(fn, 'w', newline='') as outfile:
        writer = csv.writer(outfile, delimiter='\t', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(['email', 'name', 'date', 'comment', 'url'])
        for batch in range(max_batches):
            if len(emails) >= rows:
                break
            offset = batch * batch_size
            # ``is not None`` so that a total of 0 also stops the paging;
            # ``>=`` because offset == total means nothing is left to fetch.
            if total is not None and offset >= total:
                break
            resp = es.search(index='fcc-comments', body=query,
                             size=batch_size, from_=offset)
            hits = resp['hits']['hits']
            if batch == 0:
                total = resp['hits']['total']
                # Elasticsearch 7+ reports the total as
                # {"value": N, "relation": ...}; older versions as a bare int.
                if isinstance(total, dict):
                    total = total['value']
                print('\t%s matches' % (total))
            else:
                print('\tbatch %s: have %s' % (batch + 1, len(emails)))
            if not hits:
                # An empty page means there is nothing further to read.
                break
            for doc in hits:
                if len(emails) >= rows:
                    break
                data = doc['_source']
                email = data['contact_email']
                if email in emails:
                    # Only the first comment per e-mail address is exported.
                    continue
                emails.add(email)
                writer.writerow([
                    email,
                    data['filers'][0]['name'],
                    data['date_received'],
                    data['text_data'],
                    'https://www.fcc.gov/ecfs/filing/%s' % doc['_id'],
                ])
评论列表
文章目录