def extract_genes(output_folder, refs_folder, gff_folder, di):
tp = 'gene'
refs_files = sorted([join(refs_folder, f) for f in listdir(refs_folder) if isfile(join(refs_folder, f))])
gff_files = sorted([join(gff_folder, f) for f in listdir(gff_folder) if isfile(join(gff_folder, f))])
limit_info = dict( gff_type =['gene'])
sequences = []
for ind in range(len(refs_files)):
f_fasta = refs_files[ind]
f_gff = gff_files[ind]
nm = f_gff.split("/")[-1][:-4]
record = SeqIO.to_dict(SeqIO.parse(f_fasta, "fasta"))
new_record = {}
p = re.compile("N\w_\w+\.\d")
for it in record:
new_id = p.findall(it)[0]
new_record[new_id] = record[it]
record = new_record
in_handle = open(f_gff)
for rec in GFF.parse(in_handle, limit_info=limit_info, base_dict=record):
for f in rec.features:
if f.qualifiers['ID'][0] in di.get(f_gff, []):
subrecord = SeqRecord.SeqRecord(Seq.Seq(re.sub(r'[^ATGC]', 'A', str(f.extract(rec.seq)))),
id=nm + "_" + rec.id + "_" + str(f.location),
name=nm + "_" + str(f.location)+ "_" + tp,
description= tp)
sequences.append(subrecord)
in_handle.close()
with open(output_folder+'sc_genes_seq.fasta', "w") as output_handle:
SeqIO.write(sequences, output_handle, "fasta")
评论列表
文章目录