def build_vocab(language, corpus_files_root):
    """Run the full vocabulary-building pipeline for one language.

    The steps run in this order:

    1. Create the per-language corpus and model directories with
       ``mkdir -p`` (issued through ``local``).
    2. Call ``merge_corpus`` with ``corpus_files_root`` and the
       ``<language>_wiki.corpus`` path inside the corpus directory.
    3. Call ``word_counts`` with the glob ``corpus_files_root + "/*"`` and
       the ``<language>_wiki.freqs`` path inside the model directory.
    4. Call ``word2vec`` with the corpus path and the
       ``<language>_wiki.word2vec`` path inside the model directory.
    5. Call ``brown_clusters`` with the corpus path and the ``brown``
       sub-directory of the model directory.
    6. Call ``init_vocab`` with the language, the model directory and the
       three artefact paths from steps 3-5.

    NOTE(review): the helpers are defined outside this block; the second
    argument of each is presumably its output location -- confirm against
    their definitions.

    Args:
        language: Language code substituted into ``CORPUS_DIR`` and
            ``MODEL_DIR`` and used as the prefix of the generated file names.
        corpus_files_root: Directory holding the raw corpus files.
    """
    # Resolve the two per-language working directories.
    corpus_home = CORPUS_DIR.format(lang=language)
    model_home = MODEL_DIR.format(lang=language)

    # Create them in the same order as before: corpus first, then model.
    for directory in (corpus_home, model_home):
        local("mkdir -p {}".format(directory))

    # Locations of every artefact the pipeline reads or writes.
    merged_corpus = join(corpus_home, "{}_wiki.corpus".format(language))
    freqs_path = join(model_home, "{}_wiki.freqs".format(language))
    vectors_path = join(model_home, "{}_wiki.word2vec".format(language))
    clusters_dir = join(model_home, "brown")

    # Pipeline stages, executed strictly in sequence.
    merge_corpus(corpus_files_root, merged_corpus)
    word_counts(corpus_files_root + "/*", freqs_path)
    word2vec(merged_corpus, vectors_path)
    brown_clusters(merged_corpus, clusters_dir)
    init_vocab(language, model_home, freqs_path, vectors_path, clusters_dir)
评论列表
文章目录