def zhwiki2chars(in_file, out_file):
    """Convert a wiki dump into a character-separated text file.

    Each article produced by ``WikiCorpus(in_file).get_texts()`` becomes one
    line of ``out_file``. Tokens made up solely of ASCII letters are dropped;
    every remaining token is split into its individual characters, and all
    characters on a line are separated by single spaces.

    Args:
        in_file: Path of the dump, passed straight to ``WikiCorpus``.
        out_file: Path of the text file to write (overwritten, UTF-8).
    """
    ascii_word = re.compile(r'^[a-zA-Z]+$')

    def _isalpha(string):
        # True when the whole token consists of ASCII letters only.
        return ascii_word.match(string) is not None

    # NOTE(review): the empty ``dictionary`` presumably stops WikiCorpus from
    # building a vocabulary up front, and ``lemmatize`` may not be accepted by
    # newer gensim releases -- confirm against the installed version.
    wiki = WikiCorpus(in_file, lemmatize=False, dictionary={})

    # ``with`` closes the file even if parsing fails part-way through; the
    # explicit encoding keeps the output independent of the platform locale.
    with open(out_file, 'w', encoding='utf-8') as out:
        for count, article in enumerate(wiki.get_texts(), 1):
            tokens = []
            for token in article:
                # Tokens may arrive as bytes or as str depending on the
                # library version; only bytes need decoding.
                if isinstance(token, bytes):
                    token = token.decode("utf-8")
                token = token.strip()
                if _isalpha(token):
                    continue
                tokens.append(" ".join(token))  # divided by character
            out.write(" ".join(tokens) + "\n")
            if count % 10000 == 0:
                print("process %d articles" % count)
评论列表
文章目录