def tag_text(self, text):
    """Split *text* into sentences, tokenize each one and tag every token.

    Args:
        text: Raw input text; it is passed unchanged to
            ``self.sent_detector.tokenize``.

    Returns:
        A dict with two keys: ``'tokens'`` holds every token of every
        sentence in order, and ``'pos'`` holds the tag component of each
        ``(word, tag)`` pair returned by ``self.tagger.tag``, in the same
        order.
    """
    # NOTE(review): the sentence detector and tagger are presumably NLTK
    # objects (per the original docstring) -- confirm where they are set up.
    # TODO: this will fail on some unicode chars; the sentence detector
    # appears to assume ASCII input -- confirm.
    sents = self.sent_detector.tokenize(text)

    all_tokens = []
    pos_tags = []
    for sent in sents:
        # Materialise once so the very same tokens feed both the result
        # list and the tagger, whatever iterable the tokenizer returns.
        tokens = list(self.tokenize(sent))
        # extend() grows the lists in place; `a = a + b` re-copied the whole
        # accumulated list for every sentence.
        all_tokens.extend(tokens)
        pos_tags.extend(tag for _word, tag in self.tagger.tag(tokens))
    return {'tokens': all_tokens, 'pos': pos_tags}
# (web page residue) Comment list
# (web page residue) Table of contents