def run(self):
    """Group visually similar media files and write the groups as JSON.

    Lists the files in ``data/<date_path>/media`` (``date_path`` comes from
    ``self.search``), computes the average, difference and perceptual hash
    of each file, and compares every file with all files that sort before
    it. Two files are linked in a graph when the sum of their three hash
    distances is at most ``threshold``. The connected components of that
    graph are written to ``self.output()`` as a JSON list of lists of file
    names; a file that is linked to no other file does not appear in the
    output.

    Progress is reported through ``self.update_job`` whenever the file
    index is a multiple of the block size returned by ``get_block_size``.
    """
    # FIXME: 40 is a hard-coded arbitrary (eyeballed) threshold
    threshold = 40

    date_path = self.search['date_path']
    files = sorted(os.listdir('data/%s/media' % date_path))
    total = len(files)
    hashes = {}
    g = nx.Graph()
    update_block_size = get_block_size(total, 5)

    for i, f in enumerate(files):
        fn = 'data/%s/media/%s' % (date_path, f)
        # Open the image once and close it as soon as the hashes are
        # computed, instead of opening it three times and never closing it.
        with Image.open(fn) as img:
            current = (imagehash.average_hash(img),
                       imagehash.dhash(img),
                       imagehash.phash(img))
        hashes[f] = current

        # Compare only against earlier files so each pair is checked once.
        for f2name in files[:i]:
            previous = hashes[f2name]
            # Subtracting two hashes yields their distance; the three
            # distances are summed into a single similarity score.
            sumhash = sum(a - b for a, b in zip(current, previous))
            if sumhash <= threshold:
                g.add_edge(f, f2name)

        if i % update_block_size == 0:
            self.update_job(
                date_path=date_path,
                status="STARTED: %s - %s/%s" %
                       (self.task_family, i, total)
            )

    with self.output().open('w') as fp_graph:
        # Note: sets are not JSON serializable, so each component is turned
        # into a list; sorting keeps the member order stable between runs.
        d = [sorted(component) for component in nx.connected_components(g)]
        json.dump(d, fp_graph, indent=2)
# (Web page residue, not part of the code: "comment list" / "article table of contents".)