def get_scratch_predictions(self, path_to_scratch, results_dir, scratch_basename='scratch', num_cores=1,
                            exposed_buried_cutoff=25, custom_gene_mapping=None):
    """Run and parse ``SCRATCH`` results to predict secondary structure and solvent accessibility.

    Annotations are stored in the protein's representative sequence at:

        * ``.annotations``
        * ``.letter_annotations``

    Genes whose (mapped) ID is not present in the SSpro summary are reported with ``log.error`` and skipped.
    The final ``log.info`` line reports the number of genes loaded against ``len(self.genes)``.

    Args:
        path_to_scratch (str): Path to SCRATCH executable
        results_dir (str): Path to SCRATCH results folder, which will have the files (scratch.ss, scratch.ss8,
            scratch.acc, scratch.acc20)
        scratch_basename (str): Basename of the SCRATCH results ('scratch' is default)
        num_cores (int): Number of cores to use to parallelize SCRATCH run
        exposed_buried_cutoff (int): Cutoff of exposed/buried for the acc20 predictions
        custom_gene_mapping (dict): Default parsing of SCRATCH output files is to look for the model gene IDs. If
            your output files contain IDs which differ from the model gene IDs, use this dictionary to map model
            gene IDs to result file IDs. Dictionary keys must match model genes (a gene ID missing from a
            provided mapping raises ``KeyError``).

    """
    if not self.genome_path:
        # Write all sequences as one file.
        # NOTE(review): the path is read back from ``self.genome_path`` below, so this call presumably sets
        # that attribute as a side effect -- confirm against write_representative_sequences_file.
        self.write_representative_sequences_file(outname=self.id)

    # Runs SCRATCH or loads existing results in results_dir
    scratch = SCRATCH(project_name=scratch_basename, seq_file=self.genome_path)
    scratch.run_scratch(path_to_scratch=path_to_scratch, num_cores=num_cores, outdir=results_dir)

    # Fetch every result table once, up front. These accessors do not depend on the gene being processed, so
    # calling them inside the loop only repeats the same work for each gene.
    sspro_summary = scratch.sspro_summary()
    sspro8_summary = scratch.sspro8_summary()
    sspro_results = scratch.sspro_results()
    sspro8_results = scratch.sspro8_results()
    accpro_summary = scratch.accpro_summary()
    accpro20_summary = scratch.accpro20_summary(exposed_buried_cutoff)
    accpro_results = scratch.accpro_results()
    accpro20_results = scratch.accpro20_results()

    counter = 0
    # Adding the scratch annotations to the representative_sequences letter_annotations
    for g in tqdm(self.genes_with_a_representative_sequence):
        # Translate the model gene ID into the ID used in the SCRATCH output files, if a mapping was given
        g_id = custom_gene_mapping[g.id] if custom_gene_mapping else g.id

        if g_id not in sspro_summary:
            log.error('{}: missing SCRATCH results'.format(g.id))
            continue

        repseq = g.protein.representative_sequence

        # Secondary structure
        repseq.annotations.update(sspro_summary[g_id])
        repseq.annotations.update(sspro8_summary[g_id])
        repseq.letter_annotations['SS-sspro'] = sspro_results[g_id]
        repseq.letter_annotations['SS-sspro8'] = sspro8_results[g_id]

        # Solvent accessibility
        repseq.annotations.update(accpro_summary[g_id])
        repseq.annotations.update(accpro20_summary[g_id])
        repseq.letter_annotations['RSA-accpro'] = accpro_results[g_id]
        repseq.letter_annotations['RSA-accpro20'] = accpro20_results[g_id]

        counter += 1

    log.info('{}/{}: number of genes with SCRATCH predictions loaded'.format(counter, len(self.genes)))
# (Web-page navigation residue, not part of this module: "Comment list" / "Table of contents".)