def __init__(self):
    import os
    import nltk
    from pkg_resources import resource_filename
    from nltk.tag import PerceptronTagger
    from nltk.tokenize import TreebankWordTokenizer
    tokenizer_fn = os.path.abspath(resource_filename('phrasemachine.data', 'punkt.english.pickle'))
    tagger_fn = os.path.abspath(resource_filename('phrasemachine.data', 'averaged_perceptron_tagger.pickle'))
    # Load the tagger from the bundled pickle instead of going through the NLTK downloader
    self.tagger = PerceptronTagger(load=False)
    self.tagger.load(tagger_fn)
    # note: nltk.word_tokenize calls the TreebankWordTokenizer, but uses the downloader.
    # Calling the TreebankWordTokenizer directly allows skipping the downloader.
    # The TreebankWordTokenizer uses PTB tokenization rules implemented as regexes, i.e. no downloads.
    # https://github.com/nltk/nltk/blob/develop/nltk/tokenize/treebank.py#L25
    self.tokenize = TreebankWordTokenizer().tokenize
    self.sent_detector = nltk.data.load(tokenizer_fn)
    # http://www.nltk.org/book/ch05.html
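A minimal usage sketch (not part of the original class; it loads the stock NLTK model via the downloader rather than the bundled phrasemachine pickles):

from nltk.tag import PerceptronTagger
from nltk.tokenize import TreebankWordTokenizer

# requires a one-time nltk.download('averaged_perceptron_tagger')
tokens = TreebankWordTokenizer().tokenize("Phrase extraction relies on POS tags.")
print(PerceptronTagger().tag(tokens))  # list of (word, tag) pairs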
Python TreebankWordTokenizer() example source code
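For reference, a minimal sketch of what the Treebank tokenizer produces: PTB-style tokenization splits contractions and trailing punctuation, and needs no downloaded models.

from nltk.tokenize import TreebankWordTokenizer

print(TreebankWordTokenizer().tokenize("They'll tokenize this, won't they?"))
# typically: ['They', "'ll", 'tokenize', 'this', ',', 'wo', "n't", 'they', '?']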
def get(self, text=["medical"]):
    if isinstance(text, str):
        text = text.lower()
        text = TreebankWordTokenizer().tokenize(text)
    try:
        # indexing (rather than .get) raises KeyError for out-of-vocabulary words
        data = np.array([self.vocab[word] for word in text])
        return self.onehot(data), data
    except KeyError:
        unknowns = [word for word in text if self.vocab.get(word) is None]
        raise Exception(" [!] unknown words: %s" % ",".join(unknowns))
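A toy illustration of the lookup above; vocab here is a hypothetical stand-in for the class's self.vocab, and the one-hot step uses a plain identity matrix instead of the class's self.onehot:

import numpy as np
from nltk.tokenize import TreebankWordTokenizer

vocab = {"medical": 0, "records": 1, "are": 2, "private": 3}   # hypothetical vocabulary
tokens = TreebankWordTokenizer().tokenize("Medical records are private".lower())
indices = np.array([vocab[t] for t in tokens])                 # array([0, 1, 2, 3])
onehot = np.eye(len(vocab))[indices]                           # (4, 4) one-hot matrix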
mongoreader.py — project: Natural-Language-Processing-Python-and-NLTK, author: PacktPublishing
def __init__(self, word_tokenizer=TreebankWordTokenizer(),
             sent_tokenizer=LazyLoader('tokenizers/punkt/PY3/english.pickle'),
             **kwargs):
    self._seq = MongoDBLazySequence(**kwargs)
    self._word_tokenize = word_tokenizer.tokenize
    self._sent_tokenize = sent_tokenizer.tokenize
def stem_and_tokenize_text(text):
    sents = sent_tokenize(text)
    tokens = list(itertools.chain(*[TreebankWordTokenizer().tokenize(sent) for sent in sents]))
    terms = [Term(token) for token in tokens]
    # return a list (not a lazy filter object) of non-punctuation terms
    return [term for term in terms if not term.is_punctuation()]
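A self-contained sketch of the split-then-flatten step (requires the NLTK punkt model; the project-specific Term class is omitted):

import itertools
from nltk.tokenize import sent_tokenize, TreebankWordTokenizer

text = "Tokenizers split text. Sentence splitting comes first."
tokens = list(itertools.chain.from_iterable(
    TreebankWordTokenizer().tokenize(sent) for sent in sent_tokenize(text)))
# ['Tokenizers', 'split', 'text', '.', 'Sentence', 'splitting', 'comes', 'first', '.']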
def SentenceTokenize(self, text):
    # despite the name, this produces word-level tokens via the Treebank tokenizer
    tokens = TreebankWordTokenizer().tokenize(text)
    return tokens
def tokenize(sentence):
    """Tokenize `sentence` the way the parser expects."""
    tokenizer = TreebankWordTokenizer()
    s = tokenizer.tokenize(sentence)
    s = ' '.join(s)
    # character-level replacements via the module-level REPLACEMENTS_R mapping
    s = ''.join(REPLACEMENTS_R.get(x, x) for x in s)
    return s
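A runnable sketch of the same flow with a hypothetical REPLACEMENTS_R map; the real mapping is defined elsewhere in the original module:

from nltk.tokenize import TreebankWordTokenizer

REPLACEMENTS_R = {'\u201c': '"', '\u201d': '"', '\u2019': "'"}   # hypothetical character map

def tokenize(sentence):
    s = ' '.join(TreebankWordTokenizer().tokenize(sentence))
    return ''.join(REPLACEMENTS_R.get(x, x) for x in s)

print(tokenize("Parsers prefer \u201cplain\u201d quotes."))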
def fresh(self, s, tokenized=False):
    """UD-parse and POS-tag sentence `s`. Returns (UDParse, PTB-parse-string).
    Pass in `tokenized=True` if `s` has already been tokenized, otherwise we
    apply `nltk.tokenize.TreebankWordTokenizer`.
    """
    if self.process is None:
        self._start_subprocess()
    s = str(s.strip())
    if not tokenized:
        s = tokenize(s)
    s = s.strip()
    assert '\n' not in s, "No newline characters allowed %r" % s
    try:
        self.process.stdin.write(s.encode('utf-8'))
    except IOError as e:
        # if e.errno == 32:  # broken pipe
        #     self.process = None
        #     return self(s)  # retry will restart process
        raise e
    self.process.stdin.write(b'\n')
    self.process.stdin.flush()
    out = self.process.stdout.readline()
    if sys.version_info[0] == 3:
        out = out.decode()
    return self.to_ud(out)
def __prepare__(self):
    """Read the conversation and movie-line files, tokenize each line,
    build the vocabulary, and group utterance pairs into length buckets.
    """
    conversations = open(path.join(self.BASE_PATH, self.CONVS_FILE), 'r').readlines()
    movie_lines = open(path.join(self.BASE_PATH, self.LINES_FILE), 'r').readlines()
    tbt = TreebankWordTokenizer().tokenize
    self.words_set = set()
    self.lines_dict = {}
    for i, line in enumerate(movie_lines):
        parts = [x.strip() for x in line.lower().split(self.FILE_SEP)]
        tokens = tbt(parts[-1])
        self.lines_dict[parts[0]] = tokens
        self.words_set |= set(tokens)
    self.word2idx = {}
    self.word2idx[self.PAD_TOKEN] = 0
    self.word2idx[self.EOS_TOKEN] = 1
    self.word2idx[self.GO_TOKEN] = 2
    for i, word in enumerate(self.words_set):
        self.word2idx[word] = i + 3
    self.idx2word = [0] * len(self.word2idx)
    for w, i in self.word2idx.items():
        self.idx2word[i] = w
    # extract pairs of lines in a conversation: (s0, s1, s2) -> {(s0, s1), (s1, s2)}
    utt_pairs = []
    for line in conversations:
        # the last field holds the list of line ids; strip the brackets and quotes
        fields = [x.strip() for x in line.lower().split(self.FILE_SEP)]
        parts = [x[1:-1] for x in fields[-1][1:-1].split(', ')]
        utt_pairs += list(pairwise(parts))
    utt_pairs = np.random.permutation(utt_pairs)
    train_utt_pairs = utt_pairs[self.VAL_COUNT:]
    self.val_pairs = utt_pairs[:self.VAL_COUNT]

    def find_bucket(enc_size, dec_size, buckets):
        # first bucket large enough to hold both the encoder and decoder sequence
        return next(dropwhile(lambda x: enc_size > x[0] or dec_size > x[1], buckets), None)

    for pair in train_utt_pairs:
        bckt = find_bucket(len(self.lines_dict[pair[0]]), len(self.lines_dict[pair[1]]), self.bucket_sizes)
        if bckt is None:
            self.bucket_pairs[(-1, -1)].append(pair)
        else:
            self.bucket_pairs[bckt].append(pair)
    self.bucket_ordering = []
    for bckt, _ in sorted(((b, len(p)) for b, p in self.bucket_pairs.items()),
                          key=lambda x: x[1], reverse=True):
        self.bucket_ordering.append(bckt)
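A quick standalone check of the bucket-selection helper above, with made-up bucket sizes:

from itertools import dropwhile

def find_bucket(enc_size, dec_size, buckets):
    return next(dropwhile(lambda x: enc_size > x[0] or dec_size > x[1], buckets), None)

buckets = [(5, 10), (10, 15), (20, 25)]   # hypothetical (encoder, decoder) length limits
print(find_bucket(8, 12, buckets))        # (10, 15): first bucket that fits both lengths
print(find_bucket(30, 5, buckets))        # None: no bucket is large enough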
def transform(self, X, **transform_params):
    # dense feature matrix of shape n x m
    #   n: number of docs
    #   m: size of lexicon
    features = np.empty((len(X), len(self.lexicon)))
    for docid, doc in enumerate(X):
        if self.preprocessor is not None:
            doc = self.preprocessor(doc)
        tokens = TreebankWordTokenizer().tokenize(doc)
        bigrams = [" ".join(i) for i in ngrams(tokens, 2)]
        doctokens = tokens + bigrams
        tokencounts = Counter(doctokens)
        match = set(tokencounts.keys()) & set(self.lexicon["ngram"])
        if len(match) > 0:
            # occurrences vector: which lexicon ngrams appear in the document
            occurrences = self.lexicon["ngram"].map(lambda w: w in match)
            ovec = csr_matrix(occurrences)
            # polarity vector
            pvec = csr_matrix(self.lexicon["polarity"])
            # counts vector
            counts = self.lexicon["ngram"].map(lambda w: tokencounts[w] if w in match else 0)
            cvec = csr_matrix(counts)
            if self.polarity:
                if self.weightedcount:
                    vector = ovec.multiply(pvec).multiply(cvec)
                else:
                    vector = ovec.multiply(pvec)
            else:
                if self.weightedcount:
                    vector = ovec.multiply(cvec)
                else:
                    vector = ovec
            vector = vector.todense()
        else:
            # np.empty leaves rows uninitialized, so non-matching docs must be zeroed explicitly
            vector = np.zeros(len(self.lexicon))
        features[docid] = vector
    return csr_matrix(features)
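A toy end-to-end illustration of the lexicon features computed above; the two-entry lexicon and the document are invented for the example:

import numpy as np
from collections import Counter
from nltk.tokenize import TreebankWordTokenizer
from nltk.util import ngrams

lexicon_ngrams = ["good", "not good"]           # hypothetical lexicon entries
lexicon_polarity = np.array([1.0, -1.0])

doc = "The plot is not good but the acting is good"
tokens = TreebankWordTokenizer().tokenize(doc.lower())
counts = Counter(tokens + [" ".join(bg) for bg in ngrams(tokens, 2)])

occurrences = np.array([1.0 if ng in counts else 0.0 for ng in lexicon_ngrams])
weighted = occurrences * lexicon_polarity * np.array([counts[ng] for ng in lexicon_ngrams])
print(weighted)   # [ 2. -1.]: "good" appears twice, "not good" once with negative polarity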