def convert_text2bin1(docs, writer):
    """Convert CNN story files into length-prefixed serialized tf.Examples.

    For each filename in *docs*, reads the story from
    ``<curdir>/input/cnn/stories/``, lowercases and ASCII-filters it,
    space-pads punctuation so every token is whitespace-delimited, splits
    article from its ``@highlight`` abstract sections, wraps both in
    ``<d> <p> <s> ... </s> </p> </d>`` markers (textsum data format), and
    writes a ``tf.Example`` with ``article``/``abstract`` byte features to
    *writer* as an 8-byte little-endian length followed by the payload.

    Side effects: updates the module-level ``counter`` (presumably a
    ``collections.Counter`` — confirm at definition site) with every token
    seen, and prints progress every 3000 documents.

    Args:
        docs: sequence of story filenames relative to the stories dir.
        writer: binary file-like object opened for writing.
    """
    global counter
    for i, fi in enumerate(docs):
        # Keep the file handle scoped to the read only.
        with open(os.path.join(curdir, "input", "cnn", "stories", fi),
                  'r', encoding="UTF-8") as f:
            wholetext = f.read().lower()
        # Drop non-ASCII, then space-pad quotes and punctuation so that
        # a plain .split() yields one token per symbol.
        wholetext = re.sub(r'[^\x00-\x7F]+', '', wholetext)
        wholetext = re.sub(r"(\s?[\']\s+|\s+[\']\s?)", " ' ", wholetext)
        wholetext = re.sub(r'(\s?[\"]\s+|\s+[\"]\s?)', ' " ', wholetext)
        wholetext = re.sub(r"(\'[s]\s+)", " 's ", wholetext)
        wholetext = wholetext.replace(".", " . ")
        wholetext = wholetext.replace(",", " , ")
        wholetext = wholetext.replace('-', ' - ')
        wholetext = wholetext.replace('?', ' ? ')
        wholetext = wholetext.replace('(', '( ')
        wholetext = wholetext.replace(')', ' )')
        data = wholetext.split("@highlight")
        if len(data) < 2:
            # No @highlight section: nothing usable as an abstract; skip
            # instead of raising IndexError on data[1].
            continue
        news = data[0]
        # BUG FIX: the original used only data[1], silently discarding every
        # highlight after the first — CNN stories usually carry several.
        highlights = ' '.join(h.replace('\n\n', '') for h in data[1:])
        news = (" ".join(news.split('\n\n'))).strip()
        sentences = sent_tokenize(news)
        news = ('<d> <p> '
                + ' '.join('<s> ' + sentence + ' </s>' for sentence in sentences)
                + ' </p> </d>')
        highlights = '<d> <p> <s> ' + highlights + ' </s> </p> </d>'
        # Feed every token into the global vocabulary counter.
        counter.update((news + " " + highlights).split())
        tf_example = example_pb2.Example()
        tf_example.features.feature['article'].bytes_list.value.extend(
            [(' '.join(news.split())).encode('utf-8')])
        tf_example.features.feature['abstract'].bytes_list.value.extend(
            [(' '.join(highlights.split())).encode('utf-8')])
        tf_example_str = tf_example.SerializeToString()
        str_len = len(tf_example_str)
        # Record framing: 8-byte signed length ('q'), then the raw payload.
        writer.write(struct.pack('q', str_len))
        writer.write(struct.pack('%ds' % str_len, tf_example_str))
        if i % 3000 == 0:
            print(int((float(i) / len(docs)) * 100), "%")
    # BUG FIX: original computed len(docs)/len(docs)*100 — always 100.0,
    # and a ZeroDivisionError when docs is empty. Print the literal instead.
    print(100.0, "%...." "converted\n\n")
# Blog-scrape residue (kept as a comment so the file stays importable):
# 评论列表 (comment list)
# 文章目录 (article table of contents)