def tag(text, tt_home):
    """Tag *text* with both NLTK and TreeTagger, timing each tagger.

    Args:
        text: The raw text to tag. It is handed unchanged to
            ``word_tokenize`` (NLTK path) and to ``TreeTagger.tag_text``
            (TreeTagger path).
        tt_home: Passed as ``TAGDIR`` to the ``TreeTagger`` constructor
            (presumably the TreeTagger installation directory -- confirm
            against the caller).

    Returns:
        A pair ``((nltk_tagged, nltk_seconds), (tt_tagged, tt_seconds))``
        where ``nltk_tagged`` is the result of ``pos_tag`` and
        ``tt_tagged`` is the result of ``make_tags``; each ``*_seconds``
        value is the difference between two ``time()`` readings.

    Note:
        The two timing windows are not symmetric. The NLTK window covers
        tokenization plus tagging. The TreeTagger window covers building
        the wrapper plus ``tag_text``, but NOT the ``make_tags``
        conversion, which runs after the end timestamp is taken.
    """
    # --- NLTK ---------------------------------------------------------
    # Default NLTK tokenizer:
    # TreebankWordTokenizer + PunktSentenceTokenizer.
    nltk_start = time()
    tokens = word_tokenize(text)
    # Default NLTK POS tagger.
    # NOTE(review): the original author left the underlying tagger model
    # unspecified; it depends on the installed NLTK version -- confirm.
    # Use tagset='universal' for the universal tagset.
    nltk_tagged = pos_tag(tokens)
    nltk_end = time()
    nltk_execution = nltk_end - nltk_start
    # Lazy %-style arguments: the logger formats only if the record is
    # actually emitted; the resulting message text is unchanged.
    logger.info("NLTK took %f seconds", nltk_execution)

    # --- TreeTagger wrapper --------------------------------------------
    # NOTE(review): the tokenization used on this path was left
    # unspecified by the original author -- confirm against the wrapper.
    # Default language: English (trained on the Penn treebank).
    # Default flags: -token -lemma -sgml -quiet -no-unknown
    tt_start = time()
    tt = TreeTagger(TAGDIR=tt_home)
    raw_tags = tt.tag_text(text)
    tt_end = time()
    tt_execution = tt_end - tt_start
    # Converting the raw output is deliberately kept outside the timed
    # window, matching the original measurement.
    tt_tagged = make_tags(raw_tags)
    logger.info("TreeTagger took %f seconds", tt_execution)

    return (nltk_tagged, nltk_execution), (tt_tagged, tt_execution)
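# Comment list        (web-page navigation residue, not part of the code)
# Article contents    (web-page navigation residue, not part of the code)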