japanese_tagger.py 文件源码-python代码片段

def create_traindata(self, outdir):
        trees = JaCCGReader(self.filepath).readall()
        # first construct dictionaries only
        for tree in trees:
            self._traverse(tree)
        # construct training samples with
        # categories whose frequency >= freq_cut.
        for tree in trees:
            tokens = get_leaves(tree)
            words = [token.word for token in tokens]
            self.sents.append(" ".join(words))
            cats = [token.cat.without_semantics for token in tokens]
            samples = get_context_by_window(
                    words, CONTEXT, lpad=LPAD, rpad=RPAD)
            assert len(samples) == len(cats)
            for cat, sample in zip(cats, samples):
                if self.cats[cat] >= self.cat_freq_cut:
                    self.samples[" ".join(sample)] = cat

        self.cats = {k: v for (k, v) in self.cats.items() \
                        if v >= self.cat_freq_cut}
        self.words = {k: v for (k, v) in self.words.items() \
                        if v >= self.word_freq_cut}
        with open(outdir + "/unary_rules.txt", "w") as f:
            self._write(self.unary_rules, f, comment_out_value=True)
        with open(outdir + "/seen_rules.txt", "w") as f:
            self._write(self.seen_rules, f, comment_out_value=True)
        with open(outdir + "/target.txt", "w") as f:
            self._write(self.cats, f, comment_out_value=False)
        with open(outdir + "/words.txt", "w") as f:
            self._write(self.words, f, comment_out_value=False)
        with open(outdir + "/chars.txt", "w") as f:
            self._write(self.chars, f, comment_out_value=False)
        with open(outdir + "/traindata.json", "w") as f:
            json.dump(self.samples, f)
        with open(outdir + "/trainsents.txt", "w") as f:
            for sent in self.sents:
                f.write(sent.encode("utf-8") + "\n")