def __init__(self):
import nltk
from nltk.tag import PerceptronTagger
from nltk.tokenize import TreebankWordTokenizer
tokenizer_fn = os.path.abspath(resource_filename('phrasemachine.data', 'punkt.english.pickle'))
tagger_fn = os.path.abspath(resource_filename('phrasemachine.data', 'averaged_perceptron_tagger.pickle'))
# Load the tagger
self.tagger = PerceptronTagger(load=False)
self.tagger.load(tagger_fn)
# note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
# Calling the TreebankWordTokenizer like this allows skipping the downloader.
# It seems the TreebankWordTokenizer uses PTB tokenization = regexes. i.e. no downloads
# https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
self.tokenize = TreebankWordTokenizer().tokenize
self.sent_detector = nltk.data.load(tokenizer_fn)
# http://www.nltk.org/book/ch05.html
评论列表
文章目录