def process_results(self, results):
    '''
    Process one page of results.

    Appends one CSV row per article to self.csv_file (writing the header
    first when nothing has been harvested yet). When self.pdf is set, the
    article's PDF is downloaded into <data_dir>/pdf/; when self.text is set
    and the article carries 'articleText', a tag-stripped,
    whitespace-collapsed copy is written to <data_dir>/text/.
    Finally self.harvested is advanced by self.get_highest_n(results) and
    the running total is printed.

    A KeyError raised anywhere in this processing (for example when the
    page has no ['records']['article'] entry) is swallowed and the method
    returns without updating self.harvested.
    '''
    try:
        articles = results[0]['records']['article']
        # NOTE(review): binary append mode plus the ``encoding`` keyword is
        # not accepted by the stdlib csv.DictWriter on Python 3; presumably
        # ``csv`` here is unicodecsv -- confirm against the file's imports.
        with open(self.csv_file, 'ab') as csv_file:
            writer = csv.DictWriter(csv_file, FIELDS, encoding='utf-8')
            # Only the very first page gets a header row.
            if self.harvested == 0:
                writer.writeheader()
            for article in articles:
                article_id = article['id']
                row = self.prepare_row(article)
                writer.writerow(row)
                if self.pdf:
                    pdf_url = self.get_pdf_url(article_id)
                    if pdf_url:
                        pdf_filename = self.make_filename(article)
                        pdf_file = os.path.join(
                            self.data_dir, 'pdf', '{}.pdf'.format(pdf_filename)
                        )
                        urlretrieve(pdf_url, pdf_file)
                if self.text:
                    text = article.get('articleText')
                    if text:
                        text_filename = self.make_filename(article)
                        # Drop markup tags, then collapse runs of whitespace.
                        text = re.sub('<[^<]+?>', '', text)
                        # Raw string: '\s' is not a valid str escape and
                        # triggers a warning on recent Python versions.
                        text = re.sub(r'\s\s+', ' ', text)
                        text_file = os.path.join(
                            self.data_dir, 'text', '{}.txt'.format(text_filename)
                        )
                        with open(text_file, 'wb') as text_output:
                            text_output.write(text.encode('utf-8'))
        # Short pause after each page -- presumably to go easy on the
        # remote service; confirm before changing the delay.
        time.sleep(0.5)
        self.harvested += self.get_highest_n(results)
        print('Harvested: {}'.format(self.harvested))
    except KeyError:
        # Deliberate best-effort: a page missing the expected keys is
        # skipped silently rather than aborting the caller.
        pass
评论列表
文章目录