def train_rnas(seq_file='utrs.fa', outfile='rnadocEmbedding25.pickle'):
    """Train a Doc2Vec model on the sequences of a FASTA file and save it.

    Every sequence returned by ``read_fasta_file`` has ``T`` replaced by
    ``U``, is cut into pieces by ``split_overlap_seq``, and each piece is
    encoded by ``get_4_nucleotide_composition``. The encoded pieces of one
    sequence form one "sentence". All sentences are passed through
    ``train_tag_doc`` and the result is used to build the vocabulary of,
    and to train, a gensim ``Doc2Vec`` model, which is finally written
    with ``model.save``.

    Args:
        seq_file: Path handed to ``read_fasta_file``.
        outfile: Path handed to ``model.save``.
    """
    min_count = 5
    dim = 50
    window = 5
    # NOTE(review): `window` is only reported here; it is not forwarded to
    # Doc2Vec below, so the library default applies -- confirm this is intended.
    print('dim: ' + str(dim) + ', window: ' + str(window))

    seq_dict = read_fasta_file(seq_file)
    tris = get_6_trids()

    # One sentence per input sequence: the list of encoded pieces.
    sentences = []
    for seq in seq_dict.values():
        rna_seq = seq.replace('T', 'U')
        sentences.append([
            get_4_nucleotide_composition(tris, piece)
            for piece in split_overlap_seq(rna_seq)
        ])
    print(len(sentences))

    docs = train_tag_doc(sentences)

    # Use `dim` rather than a second hard-coded 50 so the logged value and the
    # actual vector size cannot drift apart.
    model = gensim.models.doc2vec.Doc2Vec(size=dim, min_count=min_count, iter=50)
    model.build_vocab(docs)
    # NOTE(review): later gensim releases appear to require explicit
    # total_examples / epochs arguments for train() -- confirm against the
    # installed gensim version before upgrading.
    model.train(docs)

    # The interactive pdb breakpoint that used to sit here was removed: it
    # stopped every run before the model could be written.
    model.save(outfile)
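# Web-page residue from the source listing (navigation labels "Comment list" /
# "Table of contents"); not part of the program.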