def _filter_orthology_matrix(self,
remove_strains_with_no_orthology=True,
remove_strains_with_no_differences=False,
remove_genes_not_in_base_model=True):
"""Filters the orthology matrix by removing genes not in our base model, and also
removes strains from the analysis which have: 0 orthologous genes or no difference from the base strain.
Args:
remove_strains_with_no_orthology (bool): Remove strains which have no orthologous genes found
remove_strains_with_no_differences (bool): Remove strains which have all the same genes as the base model.
Default is False because since orthology is found using a PID cutoff, all genes may be present but
differences may be on the sequence level.
remove_genes_not_in_base_model (bool): Remove genes from the orthology matrix which are not present in our
base model. This happens if we use a genome file for our model that has other genes in it.
"""
if len(self.df_orthology_matrix) == 0:
raise RuntimeError('Empty orthology matrix')
initial_num_strains = len(self.strains)
# Adding names to the row and column of the orthology matrix
self.df_orthology_matrix = self.df_orthology_matrix.rename_axis('gene').rename_axis("strain", axis="columns")
# Gene filtering (of the orthology matrix)
if remove_genes_not_in_base_model:
# Check for gene IDs that are in the model and not in the orthology matrix
# This is probably because: the CDS FASTA file for the base strain did not contain the correct ID
# for the gene and consequently was not included in the orthology matrix
# Save these and report them
reference_strain_gene_ids = [x.id for x in self.reference_gempro.genes]
self.missing_in_orthology_matrix = [x for x in reference_strain_gene_ids if x not in self.df_orthology_matrix.index.tolist()]
self.missing_in_reference_strain = [y for y in self.df_orthology_matrix.index.tolist() if y not in reference_strain_gene_ids]
# Filter the matrix for genes within our base model only
self.df_orthology_matrix = self.df_orthology_matrix[self.df_orthology_matrix.index.isin(reference_strain_gene_ids)]
log.info('Filtered orthology matrix for genes present in base model')
log.warning('{} genes are in your base model but not your orthology matrix, see the attribute "missing_in_orthology_matrix"'.format(len(self.missing_in_orthology_matrix)))
log.warning('{} genes are in the orthology matrix but not your base model, see the attribute "missing_in_reference_strain"'.format(len(self.missing_in_reference_strain)))
# Strain filtering
for strain_gempro in self.strains.copy():
if remove_strains_with_no_orthology:
if strain_gempro.id not in self.df_orthology_matrix.columns:
self.strains.remove(strain_gempro)
log.info('{}: no orthologous genes found for this strain, removed from analysis.'.format(strain_gempro.id))
continue
elif self.df_orthology_matrix[strain_gempro.id].isnull().all():
self.strains.remove(strain_gempro)
log.info('{}: no orthologous genes found for this strain, removed from analysis.'.format(strain_gempro.id))
continue
if remove_strains_with_no_differences:
not_in_strain = self.df_orthology_matrix[pd.isnull(self.df_orthology_matrix[strain_gempro.id])][strain_gempro.id].index.tolist()
if len(not_in_strain) == 0:
self.strains.remove(strain_gempro)
log.info('{}: strain has no differences from the base, removed from analysis.')
continue
log.info('{} strains to be analyzed, {} strains removed'.format(len(self.strains), initial_num_strains - len(self.strains)))
评论列表
文章目录