index.py 文件源码

python
阅读 20 收藏 0 点赞 0 评论 0

项目:hocrviewer-mirador 作者: jbaiter 项目源码 文件源码
def _update_search_index(self, doc_id, autocomplete_min_count):
        """Index one document and store its per-document term counts.

        The global ``text_vocab`` counts are read before and after running
        ``UPDATE_INDEX_SINGLE_DOCUMENT`` for ``doc_id``; the per-term
        difference between both snapshots is taken as the document's own
        term frequency. Terms whose frequency is below
        ``autocomplete_min_count`` are dropped, and the remaining counts are
        stored in the ``lexica`` table as gzip-compressed, UTF-8 encoded JSON.

        :param doc_id: Identifier of the document to index; passed to the
                       update statement as ``document_id`` and stored in
                       ``lexica.document_id``.
        :param autocomplete_min_count: Minimum number of occurrences inside
                       the document for a term to be kept in its lexicon.
        """
        # FIXME: This is a bit unwieldy and I'd prefer there was a nicely
        #        scalable in-SQL solution, but unfortunately keeping the
        #        term frequencies for each document in a table makes
        #        the database size explode, so gzipped json-dumped counters
        #        it is for now :/
        vocab_query = "SELECT term, cnt FROM text_vocab"
        with self._db as cur:
            terms_before = Counter(dict(cur.execute(vocab_query).fetchall()))
            cur.execute(UPDATE_INDEX_SINGLE_DOCUMENT, {'document_id': doc_id})
            terms_after = Counter(dict(cur.execute(vocab_query).fetchall()))

            doc_terms = {}
            for term, cnt_after in terms_after.items():
                # Look the term up by its value (not the literal string
                # 'term'), otherwise every vocabulary entry is treated as new
                # and the full cumulative count is stored for each document.
                # ``.get`` yields None for unseen terms, so those always pass.
                if cnt_after == terms_before.get(term):
                    continue  # count unchanged: term is not in this document
                doc_cnt = cnt_after - terms_before.get(term, 0)
                # Skip terms below the threshold to save on size
                if doc_cnt >= autocomplete_min_count:
                    doc_terms[term] = doc_cnt

            cur.execute(
                "INSERT INTO lexica (document_id, counter) VALUES (?, ?)",
                (doc_id, gzip.compress(json.dumps(doc_terms).encode('utf8'))))
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号