def sent_tokenize(text, lang='english'):
    """
    Split *text* into sentences using NLTK's pre-trained Punkt model.

    Loads the pickled Punkt model for *lang* on first use and caches it
    per language in a module-level dict, so alternating languages across
    calls works correctly (the previous single-object cache silently
    reused whichever language was loaded first).

    :param text: str, the text to split into sentences.
    :param lang: str, one of the languages Punkt ships models for
        (default ``'english'``).
    :return: list of str, the sentences of *text*.
    :raises ValueError: if no Punkt model is available for *lang*.
    """
    global _nltk_sent_tokenizers
    # Languages for which NLTK distributes a Punkt pickle.
    available_languages = ['czech', 'danish', 'dutch', 'english',
                           'estonian', 'finnish', 'french', 'german',
                           'greek', 'italian', 'norwegian', 'polish',
                           'portuguese', 'slovene', 'spanish', 'swedish',
                           'turkish']
    # Validate on every call (not just first init) and raise instead of
    # assert: asserts are stripped under `python -O`.
    if lang not in available_languages:
        raise ValueError("Punkt Tokenizer for {} not available".format(lang))
    try:
        _nltk_sent_tokenizers
    except NameError:
        # First call ever: create the per-language cache.
        _nltk_sent_tokenizers = {}
    if lang not in _nltk_sent_tokenizers:
        # Ensure the punkt model data was downloaded (no-op if present).
        download('punkt', quiet=True)
        path_to_punkt = _nltk_downloader._download_dir + '/tokenizers/punkt/{}.pickle'.format(lang)
        with open(path_to_punkt, 'rb') as fin:
            _nltk_sent_tokenizers[lang] = pickle.load(fin)
    # Actual tokenization using the Punkt model for this language.
    return _nltk_sent_tokenizers[lang].tokenize(text)