def _update_search_index(self, doc_id, autocomplete_min_count):
    # FIXME: This is a bit unwieldy and I'd prefer there was a nicely
# scalable in-SQL solution, but unfortunately keeping the
# term frequencies for each document in a table makes
# the database size explode, so gzipped json-dumped counters
# it is for now :/
with self._db as cur:
        # Snapshot the global term counts before and after re-indexing
        # the document, so its individual contribution can be diffed out
        terms_before = Counter(dict(
            cur.execute("SELECT term, cnt FROM text_vocab").fetchall()))
        cur.execute(UPDATE_INDEX_SINGLE_DOCUMENT, {'document_id': doc_id})
        terms_after = Counter(dict(
            cur.execute("SELECT term, cnt FROM text_vocab").fetchall()))
        doc_terms = Counter(dict(
            (term, cnt_after - terms_before.get(term, 0))
            for term, cnt_after in terms_after.items()
            if cnt_after != terms_before.get(term, 0)))
        # Purge terms below the autocomplete threshold to keep the
        # compressed blob small
        to_purge = [term for term, cnt in doc_terms.items()
                    if cnt < autocomplete_min_count]
        for term in to_purge:
            del doc_terms[term]
        # Store the per-document counter as a gzipped JSON blob
        cur.execute(
            "INSERT INTO lexica (document_id, counter) VALUES (?, ?)",
            (doc_id, gzip.compress(json.dumps(doc_terms).encode('utf8'))))
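
# A minimal sketch of the corresponding read path, assuming the same
# `lexica` schema used by the INSERT above; the helper name
# `_load_doc_terms` is hypothetical, chosen here for illustration.
def _load_doc_terms(self, doc_id):
    # Fetch the gzipped JSON blob stored for this document
    with self._db as cur:
        row = cur.execute(
            "SELECT counter FROM lexica WHERE document_id = ?",
            (doc_id,)).fetchone()
    if row is None:
        return Counter()
    # Reverse the write path: gunzip, then JSON-decode back into a Counter
    return Counter(json.loads(gzip.decompress(row[0]).decode('utf8')))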