def load_word2emb(self, show_progress=True, batch_size=1000):
    """Read ``charNgram.txt`` from the Kazuma archive and insert its vectors.

    The archive path is obtained from
    ``self.ensure_file('kazuma.tar.gz', url=self.url)``. Every non-blank
    line of ``charNgram.txt`` is split on whitespace: the last
    ``self.d_emb`` tokens are parsed as floats (the vector) and the
    remaining leading tokens are re-joined with single spaces (the key).
    Only the first occurrence of each key is kept. Entries are passed to
    ``self.insert_batch`` as lists of ``(word, vec)`` tuples.

    Args:
        show_progress: When true, wrap the line iteration in ``tqdm``.
        batch_size: Number of entries collected before each
            ``self.insert_batch`` call; a final partial batch is flushed
            once all lines have been processed.

    Raises:
        ValueError: If ``charNgram.txt`` exists in the archive but is not
            a regular file, so it cannot be read. A token in the vector
            part that is not a valid float also raises ``ValueError``
            (from ``float``).
        KeyError: If the archive has no ``charNgram.txt`` member (raised
            by ``tarfile``).
    """
    fin_name = self.ensure_file('kazuma.tar.gz', url=self.url)
    member = 'charNgram.txt'

    with tarfile.open(fin_name, 'r:gz') as fzip:
        ftxt = fzip.extractfile(member)
        if ftxt is None:
            # extractfile() returns None for members that are not regular
            # files or links; fail with a clear message instead of an
            # AttributeError on None.
            raise ValueError(
                '{!r} in {!r} is not a regular file'.format(member, fin_name)
            )
        # The context manager closes the member even if read() fails.
        # Splitting immediately avoids keeping the raw bytes alive next to
        # the list of lines for the whole duration of the loop.
        with ftxt:
            lines = ftxt.read().splitlines()

    if show_progress:
        lines = tqdm(lines)

    d_emb = self.d_emb
    seen = set()
    batch = []
    for line in lines:
        # split() with no arguments already discards leading/trailing
        # whitespace, including a stray '\r'.
        elems = line.decode().split()
        if not elems:
            # A blank line would otherwise yield the bogus entry ('', []).
            continue
        vec = [float(n) for n in elems[-d_emb:]]
        # Keys may themselves contain spaces, so everything before the
        # vector belongs to the key.
        word = ' '.join(elems[:-d_emb])
        if word in seen:
            continue
        seen.add(word)
        batch.append((word, vec))
        if len(batch) == batch_size:
            self.insert_batch(batch)
            batch.clear()

    # Flush whatever is left after the last full batch.
    if batch:
        self.insert_batch(batch)
评论列表
文章目录