def extract_kmer_seq(kmer_bedfile, hg19_fa_file, outfile_kmers_seq):
'''
open bed file to write the kmer sequence
'''
with open(outfile_kmers_seq,'w') as of:
pass
wf = open(outfile_kmers_seq,'a+')
# read names and postions from bed file
records = SeqIO.to_dict(SeqIO.parse(open(hg19_fa_file), 'fasta'))
with open(kmer_bedfile) as rf:
for line in rf:
name,start,stop,gene,val,strand = bedline_split(line)
long_seq_record = records[name]
long_seq = long_seq_record.seq
alphabet = long_seq.alphabet
short_seq = str(long_seq)[start-1:stop]
short_seq_record = SeqRecord(Seq(short_seq, alphabet))
short_seq_record.seq.strip()
'''
If the sequence is in reverse strand, rev complement it
#edit 14-march-2017 (to extract kmers strictly to half open bed format)
'''
if strand == "+":
kmer_seq = short_seq_record.seq
#kmer created will be one base greater than the kmer size, trim
kmer_seq = kmer_seq[:1]
elif strand == "-":
kmer_seq = short_seq_record.seq.reverse_complement()
kmer_seq = kmer_seq[:-1]
else:
print("Strand information not found for line:\n {}".format(line))
quit()
wf.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(name,start,stop,gene,kmer_seq,strand))
rf.close()
wf.close()
file_chk = check_files(outfile_kmers_seq)
return(file_chk)
评论列表
文章目录