def similarities(self):
    """
    Compute the pairwise Levenshtein distance matrix between files.

    Distances come from ``editdistance.eval`` (implemented in the C++ pip
    package ``editdistance``) applied to the entries of ``self.file_data``.
    Later: https://docs.python.org/2/library/difflib.html

    Only the upper triangle is computed: each key of ``self.filedb`` is
    compared with the keys that sort after it, so every unordered pair is
    evaluated exactly once.

    :return: dict mapping each key to a dict ``{later_key: distance}``.
        The key that sorts last maps to an empty dict; an empty
        ``self.filedb`` yields an empty dict.
    """
    ucos = sorted(self.filedb)
    sims = {}
    for idx, uco in enumerate(ucos):
        # Arguments are handed to the logger so the message is only
        # formatted when the INFO level is actually enabled.
        logger.info('Comparing %s...', uco)
        row = sims[uco] = {}
        # Slice past the current position to skip self-comparison and
        # pairs already covered by an earlier row.
        for uco2 in ucos[idx + 1:]:
            dist = editdistance.eval(self.file_data[uco], self.file_data[uco2])
            row[uco2] = dist
            # NOTE: the %6d conversions require numeric keys.
            logger.info(' %6d vs %6d : %4d %s %s',
                        uco, uco2, dist, self.filedb[uco], self.filedb[uco2])
    return sims
评论列表
文章目录