def load_word2emb(self, show_progress=True, batch_size=1000):
    """Read the fastText ``.vec`` file for ``self.lang`` and insert its rows in batches.

    The zip archive is located through ``self.ensure_file`` (called with the
    relative path ``fasttext/<lang>.zip`` and the formatted ``self.url``).
    The member ``wiki.<lang>.vec`` is read and parsed line by line; every
    ``(word, vector)`` pair is handed to ``self.insert_batch`` in lists of at
    most ``batch_size`` items.

    Args:
        show_progress: wrap the line iteration in ``tqdm`` when true.
        batch_size: number of rows collected before ``self.insert_batch``
            is called; leftover rows are flushed at the end.

    Raises:
        ValueError: if one of the trailing ``self.d_emb`` tokens of a data
            line cannot be parsed as a float.
    """
    fin_name = self.ensure_file(
        path.join('fasttext', '{}.zip'.format(self.lang)),
        url=self.url.format(self.lang),
    )
    d_emb = self.d_emb
    seen = set()

    # The archive is only needed for this single read, so close it right away.
    with zipfile.ZipFile(fin_name) as fin:
        content = fin.read('wiki.{}.vec'.format(self.lang))

    lines = content.splitlines()
    if show_progress:
        lines = tqdm(lines)

    batch = []
    for line in lines:
        elems = line.decode().rstrip().split()
        # A usable row needs at least one word token plus d_emb numbers.
        # This skips the "<count> <dim>" header line that fastText .vec
        # files start with, as well as blank lines; previously those were
        # inserted as an empty word with a wrong-length vector.
        if len(elems) <= d_emb:
            continue
        # The word itself may contain spaces, so the vector is taken from
        # the right-hand end and everything before it is the word.
        vec = [float(n) for n in elems[-d_emb:]]
        word = ' '.join(elems[:-d_emb])
        if word in seen:
            # Keep only the first occurrence of a word.
            continue
        seen.add(word)
        batch.append((word, vec))
        if len(batch) == batch_size:
            self.insert_batch(batch)
            # The same list object is reused for the next batch.
            batch.clear()
    if batch:
        self.insert_batch(batch)
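# Comment list (web-page navigation residue, not part of the code)
# Article table of contents (web-page navigation residue, not part of the code)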