def generate_corpusfile(corpus_fname, n, out):
    """Write the n-gram corpus derived from a FASTA file to ``out``.

    ProtVec uses word2vec internally, and word2vec needs a corpus file to
    train on. Every record parsed from ``corpus_fname`` is passed to
    ``split_ngrams``; each pattern it returns is written as one line of
    space-separated n-grams.

    Args:
        corpus_fname: Path of the FASTA file to read.
        n: Number of characters per chunk, i.e. the "n" in "n-gram".
        out: Path of the corpus file to create (an existing file is
            overwritten).
    """
    # The context manager guarantees the output file is flushed and closed
    # even when parsing or writing raises part-way through.
    with open(out, "w") as f:
        for r in SeqIO.parse(corpus_fname, "fasta"):
            ngram_patterns = split_ngrams(r.seq, n)
            for ngram_pattern in ngram_patterns:
                f.write(" ".join(ngram_pattern) + "\n")
            # Progress indicator: one dot per processed record.
            # NOTE(review): the original indentation was lost; confirm the
            # dot is meant to be emitted per record rather than per line.
            sys.stdout.write(".")
评论列表
文章目录