def add_full_stops_to_the_end(infile, outfile):
    """Copy headlines from ``infile`` to ``outfile``, ending each with punctuation.

    Cleans the data of short titles and adds a full stop to every kept line
    that lacks closing punctuation (the original note says this is needed
    for NLTK to work).

    A line is dropped when it starts with a space or contains three words
    or fewer.  A kept line that already ends with one of ``. ! ? ' "`` is
    left without an extra full stop; any other kept line is stripped of
    surrounding whitespace and gets ``.`` appended.  Every line written
    ends with a single ``\\n``.

    Args:
        infile: Path of the UTF-8 encoded input file, one headline per line.
        outfile: Path of the file to create or overwrite; written as UTF-8.

    Raises:
        UnicodeDecodeError: If a line that is written is not valid UTF-8.

    Note:
        ``codecs`` must be imported by the enclosing module.
    """
    terminators = (b'.', b'!', b'?', b"'", b'"')
    # Read raw bytes and decode each written line once, so the same code
    # runs on Python 2 and Python 3 and lines are split on '\n' only.
    with open(infile, 'rb') as fin, \
            codecs.open(outfile, 'w+', 'utf-8') as fout:
        for line in fin:
            if line.startswith(b' '):
                continue
            # Ignore headlines with three words or fewer.
            if len(line.split()) <= 3:
                continue
            if line.endswith(b'\n') and line[:-1].endswith(terminators):
                # Already terminated and newline-ended: copy it verbatim.
                fout.write(line.decode('utf-8'))
                continue
            # Decide on the stripped text so that a trailing blank, a '\r'
            # or a missing final newline cannot hide existing punctuation
            # and produce a doubled full stop.
            text = line.strip()
            if not text.endswith(terminators):
                text += b'.'
            fout.write((text + b'\n').decode('utf-8'))
############################################
# Convert All except first word and quotes
# to lower case #
############################################
评论列表
文章目录