# preprocess_data.py
import nltk
from nltk.corpus import stopwords

# cfg, t_table, tknzr, and sequence_lengths are module-level objects
# defined elsewhere in this file.

def tokenize_text(sample_text):
    """Tokenize one text sample and record its token count."""
    global sequence_lengths
    # Optionally lowercase and strip punctuation via the translation table.
    if cfg.remove_punctuation:
        cleaned = sample_text.lower().translate(t_table)
    else:
        cleaned = sample_text
    # Pick NLTK's casual (tweet-aware) tokenizer or the standard one.
    if cfg.use_casual_tokenizer:
        tokens = tknzr.tokenize(cleaned)
    else:
        tokens = nltk.word_tokenize(cleaned, language='english')
    if cfg.remove_stopwords:
        # Build the stopword set once per call instead of once per token.
        stop_set = set(stopwords.words('english'))
        tokens = [w for w in tokens if w not in stop_set]
    sequence_lengths.append(len(tokens))
    return tokens
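For context, here is a minimal sketch of how this function might be wired up and called. tokenize_text expects cfg, t_table, tknzr, and sequence_lengths to exist at module level; everything below (the SimpleNamespace config, the punctuation table, the TweetTokenizer instance, the sample sentence) is an illustrative assumption, not code from the original preprocess_data.py.

# Usage sketch: the names defined here stand in for whatever the
# original module actually uses.
import string
from types import SimpleNamespace

import nltk
from nltk.tokenize import TweetTokenizer

nltk.download('punkt', quiet=True)      # needed by nltk.word_tokenize
                                        # ('punkt_tab' on newer NLTK)
nltk.download('stopwords', quiet=True)  # needed by stopwords.words

cfg = SimpleNamespace(remove_punctuation=True,
                      use_casual_tokenizer=False,
                      remove_stopwords=True)
t_table = str.maketrans('', '', string.punctuation)  # strips punctuation
tknzr = TweetTokenizer()       # used when cfg.use_casual_tokenizer is True
sequence_lengths = []          # collects per-sample token counts

print(tokenize_text("The quick brown fox jumps over the lazy dog!"))
# ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']
print(sequence_lengths)        # [6]

Recording each sample's token count in sequence_lengths lets later code inspect the length distribution, for example to choose a padding or truncation length for fixed-size model inputs.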