summarize.py 文件源码-python代码片段

def run(self):
        """Group visually similar media files and write the groups as JSON.

        Every file in ``data/<date_path>/media`` (visited in sorted name
        order) is hashed with imagehash's ``average_hash``, ``dhash`` and
        ``phash``. Each file is compared with every file hashed before it;
        when the sum of the three hash differences is within the threshold,
        the two file names are linked by an edge in an undirected graph.

        The connected components of that graph are written to
        ``self.output()`` as a JSON list of lists of file names. Files that
        matched nothing are never added to the graph and therefore do not
        appear in the output.

        Progress is reported through ``self.update_job`` every
        ``update_block_size`` files.
        """
        date_path = self.search['date_path']
        media_dir = 'data/%s/media' % date_path
        files = sorted(os.listdir(media_dir))

        # FIXME: 40 is a hard-coded arbitrary (eyeballed) threshold
        max_distance = 40

        g = nx.Graph()
        # (file name, hashes) for every file processed so far, in order.
        hashed = []
        # NOTE(review): assumes get_block_size never returns 0, otherwise the
        # modulo below raises ZeroDivisionError - confirm against the helper.
        update_block_size = get_block_size(len(files), 5)

        for index, name in enumerate(files):
            # Open the image once and close it deterministically; the three
            # hash functions all work from the same opened image.
            with Image.open(os.path.join(media_dir, name)) as img:
                current = {
                    'ahash': imagehash.average_hash(img),
                    'dhash': imagehash.dhash(img),
                    'phash': imagehash.phash(img),
                }

            # Compare against earlier files only, so each pair is seen once.
            for other_name, other in hashed:
                distance = sum(current[kind] - other[kind]
                               for kind in ('ahash', 'dhash', 'phash'))
                if distance <= max_distance:
                    g.add_edge(name, other_name)
            hashed.append((name, current))

            if index % update_block_size == 0:
                self.update_job(
                    date_path=date_path,
                    status="STARTED: %s - %s/%s" %
                           (self.task_family, index, len(files))
                )

        # Sets are not JSON serializable; sorting each component also makes
        # the output identical from run to run.
        components = [sorted(component)
                      for component in nx.connected_components(g)]
        with self.output().open('w') as fp_graph:
            json.dump(components, fp_graph, indent=2)