test_corpora_hashdictionary.py 文件源码

python
阅读 20 收藏 0 点赞 0 评论 0

项目:topical_word_embeddings 作者: thunlp 项目源码 文件源码
def testDocFreqAndToken2IdForSeveralDocsWithOneWord(self):
        # two docs
        texts = [['human'], ['human']]
        d = HashDictionary(texts, myhash=zlib.adler32)
        expected = {31002: 2}
        self.assertEqual(d.dfs, expected)
        # only one token (human) should exist
        expected = {'human': 31002}
        self.assertEqual(d.token2id['human'], expected['human'])
        self.assertEqual(d.token2id.keys(), expected.keys())

        # three docs
        texts = [['human'], ['human'], ['human']]
        d = HashDictionary(texts, myhash=zlib.adler32)
        expected = {31002: 3}
        self.assertEqual(d.dfs, expected)
        # only one token (human) should exist
        expected = {'human': 31002}
        self.assertEqual(d.token2id['human'], expected['human'])
        self.assertEqual(d.token2id.keys(), expected.keys())

        # four docs
        texts = [['human'], ['human'], ['human'], ['human']]
        d = HashDictionary(texts, myhash=zlib.adler32)
        expected = {31002: 4}
        self.assertEqual(d.dfs, expected)
        # only one token (human) should exist
        expected = {'human': 31002}
        self.assertEqual(d.token2id['human'], expected['human'])
        self.assertEqual(d.token2id.keys(), expected.keys())
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号