def testDocFreqAndToken2IdForSeveralDocsWithOneWord(self):
    """Check dfs and token2id for corpora of identical one-word documents.

    For 2, 3 and 4 documents that each contain only the token 'human',
    the dictionary's document-frequency map must report the number of
    documents under the token's hashed id, and token2id must contain
    exactly that one token mapped to the same id.
    """
    token = 'human'
    # zlib.adler32(b'human') == 106431002; 31002 is consistent with that
    # value reduced modulo 32000 -- presumably HashDictionary's default
    # id range (TODO confirm against the HashDictionary implementation).
    token_id = 31002
    expected_token2id = {token: token_id}

    for num_docs in (2, 3, 4):
        # Build independent single-token documents (no shared inner list).
        texts = [[token] for _ in range(num_docs)]
        d = HashDictionary(texts, myhash=zlib.adler32)

        # The token appears once per document, so its df equals num_docs.
        self.assertEqual(d.dfs, {token_id: num_docs})

        # only one token (human) should exist
        self.assertEqual(d.token2id[token], expected_token2id[token])
        self.assertEqual(d.token2id.keys(), expected_token2id.keys())
test_corpora_hashdictionary.py 文件源码
python
阅读 20
收藏 0
点赞 0
评论 0
评论列表
文章目录