atlas.py 文件源码

python
阅读 35 收藏 0 点赞 0 评论 0

项目:ssbio 作者: SBRG 项目源码 文件源码
def _filter_orthology_matrix(self,
                                 remove_strains_with_no_orthology=True,
                                 remove_strains_with_no_differences=False,
                                 remove_genes_not_in_base_model=True):
        """Filters the orthology matrix by removing genes not in our base model, and also
            removes strains from the analysis which have: 0 orthologous genes or no difference from the base strain.

        Args:
            remove_strains_with_no_orthology (bool): Remove strains which have no orthologous genes found
            remove_strains_with_no_differences (bool): Remove strains which have all the same genes as the base model.
                Default is False because since orthology is found using a PID cutoff, all genes may be present but
                differences may be on the sequence level.
            remove_genes_not_in_base_model (bool): Remove genes from the orthology matrix which are not present in our
                base model. This happens if we use a genome file for our model that has other genes in it.

        """

        if len(self.df_orthology_matrix) == 0:
            raise RuntimeError('Empty orthology matrix')

        initial_num_strains = len(self.strains)

        # Adding names to the row and column of the orthology matrix
        self.df_orthology_matrix = self.df_orthology_matrix.rename_axis('gene').rename_axis("strain", axis="columns")

        # Gene filtering (of the orthology matrix)
        if remove_genes_not_in_base_model:
            # Check for gene IDs that are in the model and not in the orthology matrix
            # This is probably because: the CDS FASTA file for the base strain did not contain the correct ID
            # for the gene and consequently was not included in the orthology matrix
            # Save these and report them
            reference_strain_gene_ids = [x.id for x in self.reference_gempro.genes]
            self.missing_in_orthology_matrix = [x for x in reference_strain_gene_ids if x not in self.df_orthology_matrix.index.tolist()]
            self.missing_in_reference_strain = [y for y in self.df_orthology_matrix.index.tolist() if y not in reference_strain_gene_ids]

            # Filter the matrix for genes within our base model only
            self.df_orthology_matrix = self.df_orthology_matrix[self.df_orthology_matrix.index.isin(reference_strain_gene_ids)]
            log.info('Filtered orthology matrix for genes present in base model')
            log.warning('{} genes are in your base model but not your orthology matrix, see the attribute "missing_in_orthology_matrix"'.format(len(self.missing_in_orthology_matrix)))
            log.warning('{} genes are in the orthology matrix but not your base model, see the attribute "missing_in_reference_strain"'.format(len(self.missing_in_reference_strain)))

        # Strain filtering
        for strain_gempro in self.strains.copy():
            if remove_strains_with_no_orthology:
                if strain_gempro.id not in self.df_orthology_matrix.columns:
                    self.strains.remove(strain_gempro)
                    log.info('{}: no orthologous genes found for this strain, removed from analysis.'.format(strain_gempro.id))
                    continue
                elif self.df_orthology_matrix[strain_gempro.id].isnull().all():
                    self.strains.remove(strain_gempro)
                    log.info('{}: no orthologous genes found for this strain, removed from analysis.'.format(strain_gempro.id))
                    continue

            if remove_strains_with_no_differences:
                not_in_strain = self.df_orthology_matrix[pd.isnull(self.df_orthology_matrix[strain_gempro.id])][strain_gempro.id].index.tolist()
                if len(not_in_strain) == 0:
                    self.strains.remove(strain_gempro)
                    log.info('{}: strain has no differences from the base, removed from analysis.')
                    continue

        log.info('{} strains to be analyzed, {} strains removed'.format(len(self.strains), initial_num_strains - len(self.strains)))
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号