def setUp(self):
    """Build the tokenizer, tables and candidate sets shared by the tests.

    Attributes set on ``self``:
        dlm: ``DelimiterTokenizer`` created with ``delim_set=[' ']`` and
            ``return_set=True``.
        A: left table with ``l_id`` / ``l_attr``; ``l_attr`` includes an
            empty string and a NaN value.
        B: right table with ``r_id`` / ``r_attr``; ``r_attr`` includes a
            ``None`` value.
        C: candidate set holding every (``l_id``, ``r_id``) pair of A x B.
        empty_A, empty_B, empty_candset: zero-row frames that carry only
            the column names of A, B and C respectively.

    Note: ``A`` and ``B`` keep the helper column ``tmp_join_key`` after this
    method runs, exactly as before; only ``C`` has it dropped.
    """
    self.dlm = DelimiterTokenizer(delim_set=[' '], return_set=True)

    # float('nan') replaces pd.np.NaN: the ``pandas.np`` alias was removed in
    # pandas 2.0 and ``np.NaN`` in NumPy 2.0. The stored value is still a
    # float NaN, so missing-value checks see the same thing.
    self.A = pd.DataFrame([{'l_id': 1, 'l_attr':'ab cd ef aa bb'},
                           {'l_id': 2, 'l_attr':''},
                           {'l_id': 3, 'l_attr':'ab'},
                           {'l_id': 4, 'l_attr':'ll oo pp'},
                           {'l_id': 5, 'l_attr':'xy xx zz fg'},
                           {'l_id': 6, 'l_attr':float('nan')}])
    self.B = pd.DataFrame([{'r_id': 1, 'r_attr':'mn'},
                           {'r_id': 2, 'r_attr':'he ll'},
                           {'r_id': 3, 'r_attr':'xy pl ou'},
                           {'r_id': 4, 'r_attr':'aa'},
                           {'r_id': 5, 'r_attr':'fg cd aa ef'},
                           {'r_id': 6, 'r_attr':None}])

    # generate cartesian product A x B to be used as candset:
    # joining on a constant key pairs every row of A with every row of B.
    self.A['tmp_join_key'] = 1
    self.B['tmp_join_key'] = 1
    # ``axis`` must be passed by keyword; pandas 2.0 no longer accepts it
    # positionally in DataFrame.drop.
    self.C = pd.merge(self.A[['l_id', 'tmp_join_key']],
                      self.B[['r_id', 'tmp_join_key']],
                      on='tmp_join_key').drop('tmp_join_key', axis=1)

    # Empty inputs for the edge-case tests.
    self.empty_A = pd.DataFrame(columns=['l_id', 'l_attr'])
    self.empty_B = pd.DataFrame(columns=['r_id', 'r_attr'])
    self.empty_candset = pd.DataFrame(columns=['l_id', 'r_id'])
# Source file: test_overlap_filter.py
# Language: python
# Views: 61
# Favorites: 0
# Likes: 0
# Comments: 0
# Comment list
# Table of contents