def set_representative_sequence(self, force_rerun=False):
    """Consolidate loaded sequences and set one representative sequence per gene.

    Calls ``set_representative_sequence`` on the protein of every gene in
    ``self.genes`` and logs a summary when done. The selection itself is
    delegated to that per-protein call; its intended precedence is that
    manually set representative sequences override all existing mappings,
    and UniProt mappings override KEGG mappings except when KEGG mappings
    have PDBs associated with them and UniProt doesn't.

    Args:
        force_rerun (bool): Set to True to recheck stored sequences. Passed
            through unchanged to each protein-level call.

    Returns:
        None. Results are reported through the module logger only.

    """
    # TODO: rethink use of multiple database sources - may lead to inconsistency with genome sources

    # IDs of genes for which no usable representative sequence was obtained.
    sequence_missing = []
    for g in tqdm(self.genes):
        repseq = g.protein.set_representative_sequence(force_rerun=force_rerun)

        # A gene counts as missing when nothing was returned at all, or when
        # the returned object has no sequence file attached. The ``or``
        # short-circuits, so ``sequence_file`` is only read on a truthy repseq.
        if not repseq or not repseq.sequence_file:
            sequence_missing.append(g.id)

    log.info('{}/{}: number of genes with a representative sequence'.format(len(self.genes_with_a_representative_sequence),
                                                                             len(self.genes)))
    if sequence_missing:
        # Surface the collected IDs instead of discarding them; debug level
        # keeps the default (info) output unchanged.
        log.debug('%d genes without a representative sequence file: %s',
                  len(sequence_missing), sequence_missing)
    log.info('See the "df_representative_sequences" attribute for a summary dataframe.')
评论列表
文章目录