gempro.py 文件源码

python
阅读 21 收藏 0 点赞 0 评论 0

项目:ssbio 作者: SBRG 项目源码 文件源码
def set_representative_structure(self, seq_outdir=None, struct_outdir=None, pdb_file_type=None,
                                     engine='needle', always_use_homology=False, rez_cutoff=0.0,
                                     seq_ident_cutoff=0.5, allow_missing_on_termini=0.2,
                                     allow_mutants=True, allow_deletions=False,
                                     allow_insertions=False, allow_unresolved=True,
                                     clean=True, force_rerun=False):
        """Choose a representative structure for the protein of every gene in ``self.genes``.

        This is a thin driver: the same set of options is forwarded unchanged to
        ``protein.set_representative_structure`` of each gene, and a summary of how many genes ended up with a
        representative structure is logged afterwards.

        A gene may carry any mix of the following structure sources, which are considered during selection:

            * Homology model(s)
            * Ranked PDBs
            * BLASTed PDBs

        When ``always_use_homology`` is true, an existing homology model is always preferred as the representative.
        Multiple homology models are ranked by percent sequence coverage.

        Args:
            seq_outdir (str): Output directory for sequence alignment files; required if the GEM-PRO directories
                were not created initially
            struct_outdir (str): Output directory for structure files; required if the GEM-PRO directories
                were not created initially
            pdb_file_type (str): File type for files downloaded from the PDB - one of ``pdb``, ``pdb.gz``,
                ``mmcif``, ``cif``, ``cif.gz``, ``xml.gz``, ``mmtf``, ``mmtf.gz``
            engine (str): Pairwise alignment program, ``biopython`` or ``needle``.
                ``needle`` is the standard EMBOSS pairwise alignment tool.
                ``biopython`` is Biopython's implementation of needle. Results can differ!
            always_use_homology (bool): Whether homology models should always be set as the representative structure
            rez_cutoff (float): Resolution cutoff in Angstroms (experimental structures only)
            seq_ident_cutoff (float): Sequence identity cutoff, expressed as a decimal fraction
            allow_missing_on_termini (float): Fraction of the reference sequence length that is ignored when
                checking for modifications. Example: with 0.1 and a 100 AA reference sequence, only residues
                5 to 95 are checked for modifications.
            allow_mutants (bool): Whether mutations are allowed or checked for
            allow_deletions (bool): Whether deletions are allowed or checked for
            allow_insertions (bool): Whether insertions are allowed or checked for
            allow_unresolved (bool): Whether unresolved residues are allowed or checked for
            clean (bool): Whether structures should be cleaned
            force_rerun (bool): Whether the sequence-to-structure alignment should be rerun

        """
        # The options are identical for every gene, so assemble them once up front.
        protein_options = dict(
            seq_outdir=seq_outdir,
            struct_outdir=struct_outdir,
            pdb_file_type=pdb_file_type,
            engine=engine,
            rez_cutoff=rez_cutoff,
            seq_ident_cutoff=seq_ident_cutoff,
            always_use_homology=always_use_homology,
            allow_missing_on_termini=allow_missing_on_termini,
            allow_mutants=allow_mutants,
            allow_deletions=allow_deletions,
            allow_insertions=allow_insertions,
            allow_unresolved=allow_unresolved,
            clean=clean,
            force_rerun=force_rerun,
        )

        # The per-protein return value is not needed here; only the call itself matters.
        for gene in tqdm(self.genes):
            gene.protein.set_representative_structure(**protein_options)

        num_with_repstruct = len(self.genes_with_a_representative_structure)
        num_genes = len(self.genes)
        log.info('{}/{}: number of genes with a representative structure'.format(num_with_repstruct, num_genes))
        log.info('See the "df_representative_structures" attribute for a summary dataframe.')
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号