def _load_json_lines(path):
    """Return the JSON documents stored one per line in *path*.

    Blank lines are skipped so a stray empty line does not abort the run.
    """
    with open(path) as f:
        return [json.loads(line) for line in map(str.strip, f) if line]


def _add_context_counts(term_clus, records, window, n_clusters):
    """Accumulate neighbour-cluster counts for every term in *records*.

    For each record that has ``window`` records on both sides, every
    distinct element of its ``"txt"`` field gets ``1.0`` added at the index
    given by ``record["cluster"][0]`` of each neighbouring record.
    *term_clus* is updated in place.
    """
    # Relative positions of the neighbours; the centre record (0) is excluded.
    offsets = [d for d in range(-window, window + 1) if d != 0]
    for i in range(window, len(records) - window):
        terms = set(records[i]["txt"])
        if not terms:
            continue
        # The neighbours are the same for every term of this record, so
        # look their cluster ids up once rather than once per term.
        neighbour_clusters = [records[i + d]["cluster"][0] for d in offsets]
        for term in terms:
            counts = term_clus.get(term)
            if counts is None:
                counts = term_clus[term] = [0.0] * n_clusters
            for c in neighbour_clusters:
                counts[c] += 1.0


def step6(tipes=("news", "nocturne"), window=3, n_clusters=128):
    """Build and pickle a term -> neighbour-cluster histogram per corpus.

    For every ``tipe`` the files matching ``./tmp/tmp.<tipe>.*.json`` are
    read in reverse-sorted name order. Each file holds one JSON object per
    line with a ``"txt"`` field (iterated as a collection of hashable terms;
    presumably a token list, confirm against the producer step) and a
    ``"cluster"`` field whose first element is used as a list index, so it
    must be an integer below ``n_clusters``.

    The resulting dict maps each term to a list of ``n_clusters`` floats and
    is written to ``<tipe>.term_clus.pkl`` in the current directory. The
    context window never crosses a file boundary.

    Args:
        tipes: corpus names to process.
        window: number of records taken on each side of the centre record.
        n_clusters: length of each per-term count vector.
    """
    for tipe in tipes:
        names = sorted(
            glob.glob("./tmp/tmp.{tipe}.*.json".format(tipe=tipe)),
            reverse=True,
        )
        size = len(names)
        # One table per corpus, created outside the file loop: resetting it
        # for every file would leave only the last file's counts in the
        # pickle, and would leave it undefined when no file matches.
        term_clus = {}
        for en, name in enumerate(names):
            records = _load_json_lines(name)
            _add_context_counts(term_clus, records, window, n_clusters)
            print("{}/{} finished {}".format(en, size, name))
        # Context manager so the pickle is flushed and closed deterministically.
        with open("{tipe}.term_clus.pkl".format(tipe=tipe), "wb") as out:
            pickle.dump(term_clus, out)
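# Residue from the web page this snippet was copied from (not code):
# "评论列表" (comment list), "文章目录" (table of contents).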