def train(self):
    """Build the backoff tagger chain and store it on the instance.

    A ``BigramTagger`` trained on the Brown corpus ``news`` category is
    assigned to ``self.tagger``.  It backs off to a ``UnigramTagger``
    trained on the same sentences, which in turn backs off to a
    ``RegexpTagger`` driven by the word-shape rules listed below.
    ``self._trained`` is set to ``True`` once the chain is in place.

    Returns:
        None.
    """
    news_sents = nltk.corpus.brown.tagged_sents(categories='news')

    # (pattern, tag) pairs for the last-resort tagger.  The list order is
    # kept exactly as-is: the catch-all '.*' rule has to remain last.
    shape_rules = [
        # Numbers.  NOTE(review): the '.' ahead of the second digit group
        # is unescaped, so any single character is accepted as separator
        # (NLTK's own example escapes it) -- left untouched to preserve
        # current tagging; confirm intent before tightening.
        (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
        # Dash, colon or semicolon.
        (r'(-|:|;)$', ':'),
        # Zero or more apostrophes up to the end of the token.
        # NOTE(review): intent of the 'MD' tag here is unclear -- confirm.
        (r'\'*$', 'MD'),
        # Articles.
        (r'(The|the|A|a|An|an)$', 'AT'),
        # Words ending in "-able".
        (r'.*able$', 'JJ'),
        # Tokens starting with an ASCII capital letter.
        (r'^[A-Z].*$', 'NNP'),
        # Words ending in "-ness".
        (r'.*ness$', 'NN'),
        # Words ending in "-ly".
        (r'.*ly$', 'RB'),
        # Words ending in "-s".
        (r'.*s$', 'NNS'),
        # Words ending in "-ing".
        (r'.*ing$', 'VBG'),
        # Words ending in "-ed".
        (r'.*ed$', 'VBD'),
        # Everything else.
        (r'.*', 'NN'),
    ]

    rule_tagger = nltk.RegexpTagger(shape_rules)
    word_tagger = nltk.UnigramTagger(news_sents, backoff=rule_tagger)
    self.tagger = nltk.BigramTagger(news_sents, backoff=word_tagger)
    self._trained = True
np_extractors.py 文件源码
python
阅读 48
收藏 0
点赞 0
评论 0
评论列表
文章目录