def load_dna_and_chrom_label(args, only_labels=None):
    """Sample labelled DNA windows from a reference genome.

    Positions are drawn repeatedly with ``sample_from_bed`` until
    ``args.samples`` windows have been collected.  For each accepted
    position the surrounding window of the reference sequence is encoded
    into ``train_data`` and the position's BED label is one-hot encoded
    into ``train_labels``.

    Args:
        args: Namespace-like object providing
            ``reference_fasta`` (path parsed as FASTA via ``SeqIO``),
            ``bed_file`` (passed to ``bed_file_labels_to_dict``),
            ``samples`` (number of windows to collect),
            ``window_size`` (second dimension of the data tensor),
            ``inputs`` (mapping from upper-case base character to channel
            index) and ``labels`` (mapping from label key to label column).
        only_labels: Optional container of label keys.  When truthy, sampled
            positions whose label is not in it are discarded and resampled.

    Returns:
        A tuple ``(train_data, train_labels)`` of NumPy arrays with shapes
        ``(samples, window_size, len(inputs))`` and
        ``(samples, len(bed_labels))``.  If a sequence character is found
        that is neither in ``args.inputs`` nor a known ambiguity code, an
        error is printed and ``None`` is returned instead.

    Notes:
        * With an odd ``window_size`` the extracted window holds
          ``window_size - 1`` bases; the last position stays all-zero.
        * Windows that run past the end of a contig are shorter than
          ``window_size`` and are likewise left zero-padded at the end.
        * NOTE(review): label columns are indexed through ``args.labels``
          while the array width comes from ``bed_labels`` -- presumably the
          two agree; confirm against the caller.
    """
    record_dict = SeqIO.to_dict(SeqIO.parse(args.reference_fasta, "fasta"))
    bed_dict, bed_labels = bed_file_labels_to_dict(args.bed_file)

    train_data = np.zeros((args.samples, args.window_size, len(args.inputs)))
    train_labels = np.zeros((args.samples, len(bed_labels)))

    # Floor division: true division yields a float on Python 3, and a float
    # is not a valid slice bound for the contig record.
    idx_offset = args.window_size // 2

    # Fractional encodings for IUPAC ambiguity codes.  The table is
    # consistent with the first four channels being ordered A, C, T, G
    # (e.g. 'R' = A/G -> channels 0 and 3) -- assumes ``args.inputs`` uses
    # that same order; TODO confirm.
    ambiguity_codes = {
        'K': [0, 0, 0.5, 0.5],
        'M': [0.5, 0.5, 0, 0],
        'R': [0.5, 0, 0, 0.5],
        'Y': [0, 0.5, 0.5, 0],
        'S': [0, 0.5, 0, 0.5],
        'W': [0.5, 0, 0.5, 0],
        'B': [0, 0.333, 0.333, 0.334],
        'V': [0.333, 0.333, 0, 0.334],
        'H': [0.333, 0.333, 0.334, 0],
        'D': [0.333, 0, 0.333, 0.334],
        'X': [0.25, 0.25, 0.25, 0.25],
        'N': [0.25, 0.25, 0.25, 0.25],
    }

    count = 0
    while count < args.samples:
        contig_key, pos = sample_from_bed(bed_dict, contig_key_prefix='chr')
        contig = record_dict[contig_key]

        start = pos - idx_offset
        if start < 0:
            # A negative slice start would wrap around to the end of the
            # contig and produce an empty or unrelated window; resample.
            continue
        record = contig[start: pos + idx_offset]

        cur_label_key = bed_file_label(bed_dict, contig_key, pos)
        if only_labels and cur_label_key not in only_labels:
            continue

        train_labels[count, args.labels[cur_label_key]] = 1

        for i, b in enumerate(record.seq):
            base = b.upper()
            if base in args.inputs:
                train_data[count, i, args.inputs[base]] = 1.0
            elif base in ambiguity_codes:
                train_data[count, i, :4] = ambiguity_codes[base]
            else:
                # Abort the whole load on an unrecognised character;
                # callers receive None in this case.
                print('Error! Unknown code:', b)
                return

        count += 1

    print('Label:', bed_labels.keys(), 'label counts:', np.sum(train_labels, axis=0))
    print('Train data shape:', train_data.shape, ' Training labels shape:', train_labels.shape)
    return (train_data, train_labels)
chrom_hmm_cnn.py 文件源码
python
阅读 26
收藏 0
点赞 0
评论 0
评论列表
文章目录