def add_gene_function(blast_db,evm_path):
'''add gene symbol to gff file. the information is from the blast results
'''
blastp_fn = blast_db + '/blastp.txt'
blast_df = pd.read_csv(blastp_fn,sep='\t',usecols=[0,1,2],names=['ref','query','per'])
blast_df = blast_df[blast_df['per'].values>50]
blast_df['rna'] = blast_df['ref'].map(lambda x: '.'.join(x.split('.')[-2:]))
blast_df['pr'] = blast_df['query'].map(lambda x: x.split('|')[-1].split('_')[0])
rna_pr_dic = blast_df.set_index('rna')['pr'].to_dict()
evm_gff= evm_path + '/evm.merge.gff'
gff_df = pd.read_csv(evm_gff,sep='\t',header=None)
gff_df[8] = gff_df[8].map(lambda x: add_gene_name(x,rna_pr_dic))
gff_df = gff_df[~gff_df[8].map(lambda x: 'gene=LORF2' in x)]
gff_df.to_csv(blast_db +'/final.gff',sep='\t',index=False)
# add_gene_function(blast_db,evm_path)
#===============================================================================
# process the gmap results and exonerates results directly
#===============================================================================
#=============== 1. get all mapped geneid, rna_accession, pr_accession
评论列表
文章目录