def preprocess(s, max_tokens):
    """Normalize raw text and truncate it to at most ``max_tokens`` tokens.

    Pipeline: lowercase -> strip non-ASCII -> drop ``<s>``/``<eos>`` markers
    -> remove punctuation -> mask digits with ``#`` -> collapse newlines and
    commas to spaces -> whitespace-tokenize -> truncate.

    Args:
        s: Raw input text (``str``).
        max_tokens: Maximum number of whitespace tokens to keep.

    Returns:
        tuple: ``(cleaned_text, token_count)`` where ``cleaned_text`` is the
        space-joined token string and ``token_count`` is the length AFTER
        truncation (so it never exceeds ``max_tokens``).
    """
    s = s.lower()
    # Replace each run of non-ASCII characters with a single space.
    s = re.sub(r'[^\x00-\x7F]+', ' ', s)
    # Remove sentence-boundary markers left over from upstream corpora.
    s = re.sub(r'<s>', '', s)
    s = re.sub(r'<eos>', '', s)
    s = remove_punctuation(s)
    # Raw strings throughout: '\d' as a plain literal is an invalid escape
    # (DeprecationWarning since Python 3.6, SyntaxWarning in 3.12+).
    s = re.sub(r'\d', '#', s)   # mask every digit
    s = re.sub(r'\n', ' ', s)
    s = re.sub(r',', ' ', s)
    # NLTK's WhitespaceTokenizer splits on runs of whitespace, which is
    # exactly what str.split() does -- no external tokenizer needed.
    tokens = s.split()
    if len(tokens) > max_tokens:
        tokens = tokens[:max_tokens]
    s = " ".join(tokens)
    return s, len(tokens)
preprocessed_data.py 文件源码
python
阅读 21
收藏 0
点赞 0
评论 0
评论列表
文章目录