def __init__(self, rtepair, stop=True, lemmatize=False):
"""
:param rtepair: a ``RTEPair`` from which features should be extracted
:param stop: if ``True``, stopwords are thrown away.
:type stop: bool
"""
self.stop = stop
self.stopwords = set(['a', 'the', 'it', 'they', 'of', 'in', 'to', 'is',
'have', 'are', 'were', 'and', 'very', '.', ','])
self.negwords = set(['no', 'not', 'never', 'failed', 'rejected',
'denied'])
# Try to tokenize so that abbreviations like U.S.and monetary amounts
# like "$23.00" are kept as tokens.
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer('([A-Z]\.)+|\w+|\$[\d\.]+')
#Get the set of word types for text and hypothesis
self.text_tokens = tokenizer.tokenize(rtepair.text)
self.hyp_tokens = tokenizer.tokenize(rtepair.hyp)
self.text_words = set(self.text_tokens)
self.hyp_words = set(self.hyp_tokens)
if lemmatize:
self.text_words = set(lemmatize(token) for token in self.text_tokens)
self.hyp_words = set(lemmatize(token) for token in self.hyp_tokens)
if self.stop:
self.text_words = self.text_words - self.stopwords
self.hyp_words = self.hyp_words - self.stopwords
self._overlap = self.hyp_words & self.text_words
self._hyp_extra = self.hyp_words - self.text_words
self._txt_extra = self.text_words - self.hyp_words
评论列表
文章目录