def test_stems(self):
    """Check ``stems`` against a table of sample strings.

    Each case pairs an input string with the exact list of stems expected
    from ``stems(stops, stemmer, text)``.  The expected lists leave out
    numbers, currency amounts, clock times, single-character tokens and
    words such as "is", "the", "then" and "all"; the remaining words
    appear in stemmed form (e.g. "crying" -> "cry").
    """
    import nltk

    # Porter stemmer in its MARTIN_EXTENSIONS mode, plus NLTK's English
    # stopword list frozen into a set; both are handed to stems() as-is.
    porter = nltk.PorterStemmer(nltk.PorterStemmer.MARTIN_EXTENSIONS)
    english_words = nltk.corpus.stopwords.words('english')
    stopword_set = frozenset(english_words)

    # (input text, expected stems) pairs.
    cases = (
        ("foo bar", ['foo', 'bar']),
        ("foo $1.23 is the bar", ['foo', 'bar']),
        ("a b c d", []),  # assume single char stems are useless
        ("ab cd", ['ab', 'cd']),
        ("-1.23 1.23 foo", ['foo']),
        ("-123 foo 123", ['foo']),
        ("8:12 12:34am foo", ['foo']),
        ("ab. foo, then bar", ['ab', 'foo', 'bar']),
        ("crying infants", ["cry", "infant"]),
        ("drop 12 all 3.45 the 0.123 numbers", ['drop', 'number']),
    )

    for text, expected in cases:
        # Materialise the result so it can be compared to a plain list.
        actual = list(stems(stopword_set, porter, text))
        self.assertEqual(actual, expected)
评论列表
文章目录