def kegg_mapping_and_metadata(self, kegg_organism_code, custom_gene_mapping=None, outdir=None,
                              set_as_representative=False, force_rerun=False):
    """Load KEGG sequence/metadata for every gene in ``self.genes``.

    For each gene the KEGG ID is looked up (optionally through
    ``custom_gene_mapping``), ``protein.load_kegg`` is called with
    ``download=True``, and the UniProt ID on the loaded KEGG entry is
    refreshed from an organism-wide KEGG -> UniProt mapping that is fetched
    once up front. If the protein's representative sequence carries the same
    KEGG ID, its UniProt ID is refreshed as well.

    Args:
        kegg_organism_code (str): The three letter KEGG code of your organism.
        custom_gene_mapping (dict): Optional mapping of model gene IDs to the
            IDs that should be used for the KEGG lookup. When supplied (and
            non-empty) it is indexed with every model gene ID, so a plain
            dict that lacks a gene ID raises ``KeyError``.
        outdir (str): Path to output directory of downloaded files, must be
            set if GEM-PRO directories were not created initially.
        set_as_representative (bool): If mapped KEGG IDs should be set as
            representative sequences (forwarded to ``load_kegg``).
        force_rerun (bool): If you want to overwrite any existing mappings
            and files (forwarded to ``load_kegg``).

    Returns:
        None. Progress is reported through the module logger; the final log
        message refers to the ``df_kegg_metadata`` attribute for a summary.
    """
    # One bulk request for the whole organism instead of one per gene.
    uniprot_by_kegg = ssbio.databases.kegg.map_kegg_all_genes(organism_code=kegg_organism_code,
                                                              target_db='uniprot')

    mapped_count = 0
    for gene in tqdm(self.genes):
        # An empty/None custom mapping means the model gene ID is used as-is.
        kegg_id = custom_gene_mapping[gene.id] if custom_gene_mapping else gene.id

        # download=True asks load_kegg to fetch the files for this entry.
        kegg_seq = gene.protein.load_kegg(kegg_id=kegg_id,
                                          kegg_organism_code=kegg_organism_code,
                                          download=True,
                                          outdir=outdir,
                                          set_as_representative=set_as_representative,
                                          force_rerun=force_rerun)

        # Overwrite a potentially outdated UniProt ID with the bulk-mapped one.
        if kegg_id in uniprot_by_kegg:
            current_uniprot = uniprot_by_kegg[kegg_id]
            kegg_seq.uniprot = current_uniprot

            # Keep the representative sequence in sync when it is this entry.
            representative = gene.protein.representative_sequence
            if representative and representative.kegg == kegg_seq.kegg:
                representative.uniprot = current_uniprot

        # A gene only counts as mapped when a sequence file is available.
        if kegg_seq.sequence_file:
            mapped_count += 1

        log.debug('{}: loaded KEGG information for gene'.format(gene.id))

    log.info('{}/{}: number of genes mapped to KEGG'.format(mapped_count, len(self.genes)))
    log.info('Completed ID mapping --> KEGG. See the "df_kegg_metadata" attribute for a summary dataframe.')
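# NOTE: stray web-page navigation labels ("comment list", "article table of contents") were here; they are not part of the code.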