_names.py 文件源码

python
阅读 88 收藏 0 点赞 0 评论 0

项目:memex-dossier-open 作者: dossier 项目源码 文件源码
def process(self, fc, context=None):
    """Add per-entity-type name counts to ``fc`` from one of its text features.

    The feature to read is named by the ``'text_source'`` entry of
    ``self.config``.  If that entry is missing/empty, or ``fc`` has no
    such feature, ``fc`` is returned untouched.

    Otherwise the text is split into sentences, each sentence is
    word-tokenized, POS-tagged and passed through ``nltk.ne_chunk``.
    Every resulting chunk that exposes a ``label`` attribute contributes
    one count for its space-joined leaf tokens under that label.  Each
    label's counter is then stored in ``fc`` under the label itself,
    replacing any feature already stored there.

    :param fc: mapping-like feature collection; modified in place
    :param context: not used by this method
    :returns: the same ``fc`` object that was passed in
    """
    source_key = self.config.get('text_source')
    if not source_key or source_key not in fc:
        return fc

    document = fc[source_key]
    counts_by_label = defaultdict(StringCounter)

    for sentence in nltk.sent_tokenize(document):
        tokens = nltk.word_tokenize(sentence)
        tagged = nltk.pos_tag(tokens)
        for node in nltk.ne_chunk(tagged):
            # Only chunks exposing ``label`` are counted; everything
            # else yielded by the chunker is skipped.
            if not hasattr(node, 'label'):
                continue
            entity_label = node.label()
            words = [leaf[0] for leaf in node.leaves()]
            entity_name = ' '.join(words)
            # Decode byte strings as UTF-8 so ``cleanse`` always
            # receives unicode text.
            if not isinstance(entity_name, unicode):
                entity_name = unicode(entity_name, 'utf-8')
            entity_name = cleanse(entity_name)
            counts_by_label[entity_label][entity_name] += 1

    for entity_label, counter in counts_by_label.items():
        fc[entity_label] = counter
    return fc
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号