def _update_search_index(self, doc_id, autocomplete_min_count):
    # FIXME: This is a bit unwieldy and I'd prefer there was a nicely
# scalable in-SQL solution, but unfortunately keeping the
# term frequencies for each document in a table makes
# the database size explode, so gzipped json-dumped counters
# it is for now :/
with self._db as cur:
        # Snapshot the global term counts before and after re-indexing
        # the document, so its individual contribution can be diffed out
        terms_before = Counter(dict(
            cur.execute("SELECT term, cnt FROM text_vocab").fetchall()))
        cur.execute(UPDATE_INDEX_SINGLE_DOCUMENT, {'document_id': doc_id})
        terms_after = Counter(dict(
            cur.execute("SELECT term, cnt FROM text_vocab").fetchall()))
        doc_terms = Counter(dict(
            (term, cnt_after - terms_before.get(term, 0))
            for term, cnt_after in terms_after.items()
            if cnt_after != terms_before.get(term, 0)))
        # Purge terms below the autocomplete threshold to keep the
        # compressed blob small
        to_purge = [term for term, cnt in doc_terms.items()
                    if cnt < autocomplete_min_count]
        for term in to_purge:
            del doc_terms[term]
        # Store the per-document counter as a gzipped JSON blob
        cur.execute(
            "INSERT INTO lexica (document_id, counter) VALUES (?, ?)",
            (doc_id, gzip.compress(json.dumps(doc_terms).encode('utf8'))))
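
# A minimal sketch of the corresponding read path, assuming the same
# `lexica` schema used by the INSERT above; the helper name
# `_load_doc_terms` is hypothetical, chosen here for illustration.
def _load_doc_terms(self, doc_id):
    # Fetch the gzipped JSON blob stored for this document
    with self._db as cur:
        row = cur.execute(
            "SELECT counter FROM lexica WHERE document_id = ?",
            (doc_id,)).fetchone()
    if row is None:
        return Counter()
    # Reverse the write path: gunzip, then JSON-decode back into a Counter
    return Counter(json.loads(gzip.decompress(row[0]).decode('utf8')))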