# Assumed module-level imports for this method:
#   from collections import defaultdict
#   import nltk
# StringCounter and cleanse are expected to come from the enclosing package.
def process(self, fc, context=None):
    '''Add one StringCounter of named-entity mentions per entity type to ``fc``.

    The text to tag is read from the feature named by the ``text_source``
    config option; if that option or the feature is missing, ``fc`` is
    returned unchanged.
    '''
    text_source = self.config.get('text_source')
    if text_source and text_source in fc:
        text = fc[text_source]
    else:
        return fc

    # Count cleansed entity names, grouped by NE label (PERSON, GPE, ...).
    names = defaultdict(StringCounter)
    for sent in nltk.sent_tokenize(text):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
            # Named-entity chunks are Tree objects and carry a label;
            # plain (token, POS) tuples do not.
            if hasattr(chunk, 'label'):
                label = chunk.label()
                name = ' '.join(c[0] for c in chunk.leaves())
                if not isinstance(name, unicode):
                    name = unicode(name, 'utf-8')
                name = cleanse(name)
                # print label, name
                names[label][name] += 1

    # Store one counter feature per entity type, e.g. fc['PERSON'].
    for entity_type, name_counts in names.items():
        fc[entity_type] = name_counts
    return fc
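
A minimal usage sketch of the stage above, under assumptions the snippet itself does not confirm: the method lives on a pipeline-stage class (called EntityExtractor here purely for illustration) whose constructor stores a config dict on self.config, and fc behaves like a mutable mapping from feature names to values:

    stage = EntityExtractor({'text_source': 'clean_visible'})  # class name is hypothetical
    fc = {'clean_visible': u'Barack Obama met Angela Merkel in Berlin.'}
    fc = stage.process(fc)
    # fc now also holds one counter per entity type that nltk.ne_chunk found,
    # e.g. fc['PERSON'] with counts of the cleansed name strings.

Running this also requires the NLTK data behind sent_tokenize, pos_tag, and ne_chunk (the sentence tokenizer, POS tagger, and NE chunker models), which can be fetched once with nltk.download().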