def gene_rna_pr_id(hamster_id,gmap_gff,out_fn):
'''this fnction get all gene rna pr id, including both refseq and gff information.
* hamster_id: a file that has all ids in hamster.gff file
* gmap_gff: gff results mapped using gmap
* out_fn:
'''
# rna accession in gff file
ham_id_df = pd.read_csv(hamster_id,sep='\t',header=0)
ham_id_df = ham_id_df.astype('str')
ham_id_df['TrAccess'] = ham_id_df['TrAccess'].map(lambda x: x.split('.')[0])
ham_id_df['PrAccess'] = ham_id_df['PrAccess'].map(lambda x: x.split('.')[0])
rna_gene_dic = ham_id_df.set_index('TrAccess')['GeneID'].to_dict()
rna_pr_dic = ham_id_df.set_index('TrAccess')['PrAccess'].to_dict()
#-------- read rna gff file
rna_df = pd.read_csv(gmap_gff,sep='\t',header=None,comment='#')
# add rna accession column
rna_df['rna_ac'] = rna_df[8].map(lambda x: re.search('(?<=ID=).+?(?=\.)',x).group(0))
mrna = list(set(rna_df['rna_ac'].tolist()))
# new rna in refseq compared to gff
new_ref_rna = list(set(mrna) - set(rna_gene_dic.keys()))
# get geneid for new ref_rna gene id
for r in new_ref_rna:
handle = Entrez.efetch(db='nucleotide',id=r,rettype='gb',retmode='text').read()
geneid = re.search('(?<=GeneID:).+?(?=\")',handle).group(0)
try:
p = re.search('(?<=protein_id=\").+?(?=\.)',handle).group(0)
except:
p = '-'
rna_gene_dic[r] = geneid
rna_pr_dic[r] = p
# transfer dic to dataframe
r_g_df = pd.DataFrame.from_dict(rna_gene_dic,'index')
r_g_df.columns = ['geneid']
r_p_df = pd.DataFrame.from_dict(rna_pr_dic,'index')
r_p_df.columns = ['pr_ac']
g_r_p_df = pd.concat([r_g_df,r_p_df],axis=1)
g_r_p_df['rna_ac'] = g_r_p_df.index
g_r_p_df[['geneid','rna_ac','pr_ac']].to_csv(out_fn,sep='\t',index=False)
评论列表
文章目录