def create_traindata(self, outdir):
    """Build dictionaries and training samples, then write them to ``outdir``.

    The trees read from ``self.filepath`` are processed in two passes. The
    first pass calls ``self._traverse`` on every tree (presumably this is
    what fills the frequency dictionaries used below -- TODO confirm against
    ``_traverse``). The second pass maps every token's context window to its
    category, keeping only categories whose count in ``self.cats`` is at
    least ``self.cat_freq_cut``.

    Afterwards ``self.cats`` and ``self.words`` are replaced by copies that
    only keep entries reaching ``self.cat_freq_cut`` / ``self.word_freq_cut``.

    Files written under ``outdir``:
        unary_rules.txt, seen_rules.txt, target.txt, words.txt, chars.txt
            -- produced by ``self._write``.
        traindata.json -- ``self.samples`` dumped as JSON.
        trainsents.txt -- one space-joined sentence per line, UTF-8 encoded.

    Args:
        outdir: Directory to write into. It is not created here.
    """
    trees = JaCCGReader(self.filepath).readall()

    # First pass: construct the dictionaries only.
    for tree in trees:
        self._traverse(tree)

    # Second pass: construct training samples, restricted to
    # categories whose frequency >= cat_freq_cut.
    for tree in trees:
        tokens = get_leaves(tree)
        words = [token.word for token in tokens]
        self.sents.append(" ".join(words))
        cats = [token.cat.without_semantics for token in tokens]
        samples = get_context_by_window(
            words, CONTEXT, lpad=LPAD, rpad=RPAD)
        assert len(samples) == len(cats)
        for cat, sample in zip(cats, samples):
            if self.cats[cat] >= self.cat_freq_cut:
                self.samples[" ".join(sample)] = cat

    # Prune the dictionaries only after the sample loop above, which
    # still needs the unfiltered category counts.
    self.cats = {k: v for (k, v) in self.cats.items()
                 if v >= self.cat_freq_cut}
    self.words = {k: v for (k, v) in self.words.items()
                  if v >= self.word_freq_cut}

    with open(outdir + "/unary_rules.txt", "w") as f:
        self._write(self.unary_rules, f, comment_out_value=True)
    with open(outdir + "/seen_rules.txt", "w") as f:
        self._write(self.seen_rules, f, comment_out_value=True)
    with open(outdir + "/target.txt", "w") as f:
        self._write(self.cats, f, comment_out_value=False)
    with open(outdir + "/words.txt", "w") as f:
        self._write(self.words, f, comment_out_value=False)
    with open(outdir + "/chars.txt", "w") as f:
        self._write(self.chars, f, comment_out_value=False)
    with open(outdir + "/traindata.json", "w") as f:
        json.dump(self.samples, f)

    # The sentences are encoded explicitly, so the file is opened in binary
    # mode and the newline is a bytes literal: concatenating the encoded
    # sentence with a text "\n" raises TypeError on Python 3.
    with open(outdir + "/trainsents.txt", "wb") as f:
        f.writelines(sent.encode("utf-8") + b"\n" for sent in self.sents)
# (Web-page residue, not part of the program: "comment list" / "table of contents".)