def get_evm_pr(evm_path,ref_fa,out_path):
    '''Write the proteins of every scaffold in the EVM gff and merge them into one file.

    * evm_path: directory that contains the 'evm.merge.gff' file
    * ref_fa: reference fasta file
    * out_path: directory for the temporary per-scaffold files and the final
      'pr_merge.fa'; it is deleted and recreated if it already exists

    Side effect: the process working directory is changed to out_path and is
    left there when the function returns.
    '''
    # Resolve the input locations BEFORE changing directory; otherwise
    # relative paths would be looked up inside out_path and not be found.
    evm_gff = os.path.join(os.path.abspath(evm_path), 'evm.merge.gff')
    ref_fa = os.path.abspath(ref_fa)

    # Start from an empty output directory (parents are created if missing).
    if os.path.exists(out_path):
        shutil.rmtree(out_path)
    os.makedirs(out_path)
    os.chdir(out_path)

    # The gff is read as a headerless tab-separated table:
    # column 0 = scaffold, column 2 = feature type, column 8 = attributes.
    gff_df = pd.read_csv(evm_gff, sep='\t', header=None)
    cds_df = gff_df[gff_df[2] == 'CDS'].reset_index(drop=True)
    # Takes the second ';'-separated attribute and drops its first 7
    # characters -- presumably the 'Parent=' prefix; verify against the gff.
    cds_df['rna_id'] = cds_df[8].map(lambda x: x.split(';')[1][7:])

    dic = SeqIO.index(ref_fa, 'fasta')
    try:
        # One call per distinct scaffold; output_cds is expected to leave
        # '*.fa' files in the current directory (assumption -- it is defined
        # elsewhere in the file).
        for scaff in set(cds_df[0].tolist()):
            output_cds(scaff, cds_df, dic)
    finally:
        # Release the file handle held by the fasta index.
        dic.close()

    # merge files
    # Collect the per-scaffold files before the merged file is created so the
    # merged file can never be part of its own input.
    fns = natsorted(glob.glob('*.fa'))
    # Concatenate in Python instead of a shell 'cat': no quoting problems,
    # no argument-length limit, and no hang on stdin when fns is empty.
    with open('pr_merge.fa', 'wb') as merged:
        for fn in fns:
            with open(fn, 'rb') as part:
                shutil.copyfileobj(part, merged)
    for fn in fns:
        os.remove(fn)
评论列表
文章目录