Genome_Annotation.py 文件源码-python代码片段

Genome_Annotation.py 文件源码

python

阅读 22 收藏 0 点赞 0 评论 0

项目：NGS-Pipeline 作者: LewisLabUCSD 项目源码文件源码

def get_evm_pr(evm_path,ref_fa,out_path):
    '''Extract the CDS records of an EVM annotation per scaffold and merge the
    per-scaffold outputs into a single file.

    * evm_path: directory that contains 'evm.merge.gff'
    * ref_fa: reference fasta file
    * out_path: directory for the temporary per-scaffold files and the final
      merged file 'pr_merge.fa'. It is DELETED and recreated if it exists.

    Side effects: the process working directory is changed to out_path and is
    left there on return, because the per-scaffold '*.fa' files are collected
    from the current directory.
    '''
    # Resolve every path before chdir; otherwise relative arguments would be
    # looked up inside out_path and not where the caller meant.
    evm_gff = os.path.abspath(os.path.join(evm_path, 'evm.merge.gff'))
    ref_fa = os.path.abspath(ref_fa)
    out_path = os.path.abspath(out_path)

    if os.path.exists(out_path):
        shutil.rmtree(out_path)
    # makedirs also creates missing parent directories.
    os.makedirs(out_path)
    os.chdir(out_path)

    gff_df = pd.read_csv(evm_gff,sep='\t',header=None)
    # Column 2 is the feature type; keep only the CDS rows.
    cds_df = gff_df[gff_df[2] == 'CDS'].reset_index(drop=True)
    # Assumes the second ';'-separated attribute is 'Parent=<rna id>'
    # ('Parent=' is 7 characters) -- TODO confirm against the gff producer.
    cds_df['rna_id'] = cds_df[8].map(lambda x: x.split(';')[1][7:])

    dic = SeqIO.index(ref_fa,'fasta')
    try:
        # One output_cds call per distinct scaffold name (column 0).
        for scaff in set(cds_df[0].tolist()):
            output_cds(scaff,cds_df,dic)
    finally:
        # The index keeps the fasta file open; release the handle.
        dic.close()

    # merge files
    # The file list is taken before 'pr_merge.fa' is created, so the merged
    # file can never be one of its own inputs.
    fns = natsorted(glob.glob('*.fa'))
    # Concatenate in Python rather than through a shell 'cat': file names are
    # never interpreted by a shell, there is no command-line length limit, and
    # an empty file list yields an empty output instead of 'cat' reading stdin.
    with open('pr_merge.fa', 'wb') as merged:
        for fn in fns:
            with open(fn, 'rb') as part:
                shutil.copyfileobj(part, merged)
    for fn in fns:
        os.remove(fn)