def pdb_downloader_and_metadata(self, outdir=None, pdb_file_type=None, force_rerun=False):
    """Download ALL mapped experimental structures to each protein's structures directory.

    Args:
        outdir (str): Path to output directory, if GEM-PRO directories were not set or another output
            directory is desired
        pdb_file_type (str): Type of PDB file to download, if not already set or another format is desired
        force_rerun (bool): If files should be re-downloaded even if they already exist

    """
    if not pdb_file_type:
        pdb_file_type = self.pdb_file_type

    counter = 0
    for g in tqdm(self.genes):
        pdbs = g.protein.pdb_downloader_and_metadata(outdir=outdir, pdb_file_type=pdb_file_type,
                                                     force_rerun=force_rerun)
        if pdbs:
            counter += len(pdbs)

    log.info('Updated PDB metadata dataframe. See the "df_pdb_metadata" attribute for a summary dataframe.')
    log.info('Saved {} structures total'.format(counter))
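# --- Usage sketch (added; not part of the original source) ---
# A minimal, hypothetical way to call the GEM-PRO method above. The import path and
# constructor arguments are assumptions; only pdb_downloader_and_metadata() and the
# "df_pdb_metadata" attribute come from the snippet itself.
from ssbio.pipeline.gempro import GEMPRO

my_gempro = GEMPRO(gem_name='example_project', root_dir='/tmp/gempro',
                   genes_list=['b0720', 'b1241'], pdb_file_type='pdb')
# Sequences and structures must be mapped first (e.g. via the KEGG/UniProt mapping
# methods shown later in this collection) before structures can be downloaded.
my_gempro.pdb_downloader_and_metadata(force_rerun=False)
print(my_gempro.df_pdb_metadata.head())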
def tqdm_callback(N, notebook=True):
    """
    Return a :mod:`tqdm` progress bar expecting *N* iterations, suitable for
    Jupyter if *notebook* is true and for the terminal otherwise. The progress
    bar includes an additional method :func:`callback` (a function of one
    ignored parameter) meant to be passed as a callback function called to
    update the progress bar.
    """
    if notebook:
        progress_bar = tqdm.tqdm_notebook(total=N)
    else:
        progress_bar = tqdm.tqdm(total=N)

    def callback(self, i):
        self.update()

    progress_bar.callback = partial(callback, progress_bar)
    return progress_bar
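# --- Usage sketch (added; not part of the original source) ---
# A self-contained illustration of the same callback pattern: a one-argument
# callback bound to a bar, handed to a consumer that reports progress through it.
# The run_iterations() consumer is hypothetical; only tqdm and functools are assumed.
from functools import partial
import tqdm

bar = tqdm.tqdm(total=50)
bar.callback = partial(lambda pbar, i: pbar.update(), bar)

def run_iterations(n_iter, on_step):
    """Hypothetical consumer that reports progress via a one-argument callback."""
    for i in range(n_iter):
        on_step(i)  # the bar ignores `i` and simply advances by one

run_iterations(50, bar.callback)
bar.close()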
def __download_competition_file(self, competition, file_name, browser):
    url = 'https://www.kaggle.com/c/%s/download/%s' % (competition, file_name)
    res = browser.get(url, stream=True)
    total_size = int(res.headers.get('content-length', 0))
    if res.status_code != 200:
        print('error downloading %s' % file_name)
        return False

    file_name = os.path.basename(url)
    pbar = tqdm(total=total_size, unit='B', unit_scale=True, desc=file_name)
    chunk_size = 32 * 1024
    with open(file_name, 'wb') as file_handle:
        for data in res.iter_content(chunk_size):
            file_handle.write(data)
            pbar.update(len(data))  # count the bytes actually written; the last chunk may be short
    pbar.close()
    return True
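# --- Pattern sketch (added; not part of the original source) ---
# The same streamed-download-with-progress pattern as a standalone helper using
# requests; the function name and defaults are assumptions, not the original API.
import os
import requests
from tqdm import tqdm

def download_with_progress(url, dest=None, chunk_size=32 * 1024):
    """Stream a file to disk while updating a byte-accurate tqdm bar."""
    dest = dest or os.path.basename(url)
    res = requests.get(url, stream=True)
    res.raise_for_status()
    total_size = int(res.headers.get('content-length', 0))
    with open(dest, 'wb') as fh, tqdm(total=total_size, unit='B',
                                      unit_scale=True, desc=dest) as pbar:
        for data in res.iter_content(chunk_size):
            fh.write(data)
            pbar.update(len(data))
    return dest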
def reset(self, stream=None):
    """ Reset the progress bar(s) """
    logger.debug("Update stream descriptor for progress bars.")
    if stream is not None:
        self.stream = stream
    if self.stream is None:
        logger.warning("No stream is associated with the progress bars!")
        self.axes = []
    else:
        self.axes = self.stream.descriptor.axes
    self.num = min(self.num, len(self.axes))

    self.totals = [self.stream.descriptor.num_points_through_axis(axis) for axis in range(self.num)]
    self.chunk_sizes = [max(1, self.stream.descriptor.num_points_through_axis(axis + 1)) for axis in range(self.num)]

    logger.debug("Reset the progress bars to initial states.")
    self.bars = []
    for i in range(self.num):
        if self.notebook:
            self.bars.append(tqdm_notebook(total=self.totals[i] / self.chunk_sizes[i]))
        else:
            self.bars.append(tqdm(total=self.totals[i] / self.chunk_sizes[i]))
def update(self):
    """ Update the status of the progress bar(s) """
    if self.stream is None:
        logger.warning("No stream is associated with the progress bars!")
        num_data = 0
    else:
        num_data = self.stream.points_taken
    logger.debug("Update the progress bars.")
    for i in range(self.num):
        if num_data == 0:
            # Reset the progress bar with a new one
            if self.notebook:
                self.bars[i].sp(close=True)
                self.bars[i] = tqdm_notebook(total=self.totals[i] / self.chunk_sizes[i])
            else:
                self.bars[i].close()
                self.bars[i] = tqdm(total=self.totals[i] / self.chunk_sizes[i])
        pos = int(10 * num_data / self.chunk_sizes[i]) / 10.0  # One decimal is good enough
        if pos > self.bars[i].n:
            self.bars[i].update(pos - self.bars[i].n)
        num_data = num_data % self.chunk_sizes[i]
def ka_bagging_2class_or_reg_lgbm(X_train, y_train, seed, bag_round, params,
                                  X_test, using_notebook=True, num_boost_round=0):
    '''
    early version
    '''
    # create an array object to hold the predictions
    baggedpred = np.zeros(shape=X_test.shape[0]).astype(np.float32)
    # loop for as many times as we want bags
    if using_notebook:
        for n in tqdm_notebook(range(0, bag_round)):
            # shuffle first, aids in increasing variance and forces different results
            X_train, y_train = shuffle(X_train, y_train, random_state=seed + n)
            params['seed'] = seed + n
            model = lightgbm.train(params, lightgbm.Dataset(X_train, y_train), num_boost_round=num_boost_round)
            pred = model.predict(X_test)
            baggedpred += pred / bag_round
    return baggedpred
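# --- Usage sketch (added; not part of the original source) ---
# A hedged example of calling the bagging helper above on synthetic data. The
# parameter values and toy data are illustrative assumptions; using_notebook=True
# is required because the snippet only implements the notebook branch.
import numpy as np

rng = np.random.RandomState(0)
X_train = rng.rand(500, 10)
y_train = (X_train[:, 0] + 0.1 * rng.randn(500) > 0.5).astype(int)
X_test = rng.rand(100, 10)

params = {'objective': 'binary', 'learning_rate': 0.1, 'num_leaves': 31, 'verbose': -1}
pred = ka_bagging_2class_or_reg_lgbm(X_train, y_train, seed=42, bag_round=5,
                                     params=params, X_test=X_test,
                                     using_notebook=True, num_boost_round=100)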
def compute_sgd(data):
    logging.info('Computing SGD')
    n_splits = 10
    folder = StratifiedKFold(n_splits=n_splits, shuffle=True)
    for ix_first, ix_second in tqdm_notebook(folder.split(np.zeros(data['y_train'].shape[0]), data['y_train']),
                                             total=n_splits):
        # {'en__l1_ratio': 0.0001, 'en__alpha': 1e-05}
        model = SGDClassifier(
            loss='log',
            penalty='elasticnet',
            fit_intercept=True,
            n_iter=100,
            shuffle=True,
            n_jobs=-1,
            l1_ratio=0.0001,
            alpha=1e-05,
            class_weight=None)
        model = model.fit(data['X_train'][ix_first, :], data['y_train'][ix_first])
        data['y_train_pred'][ix_second] = logit(model.predict_proba(data['X_train'][ix_second, :])[:, 1])
        data['y_test_pred'].append(logit(model.predict_proba(data['X_test'])[:, 1]))
    data['y_test_pred'] = np.array(data['y_test_pred']).T.mean(axis=1)
    return data
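# --- Pattern sketch (added; not part of the original source) ---
# The loss='log' and n_iter arguments above reflect an older scikit-learn API.
# Below is a minimal sketch of the same out-of-fold scheme against a recent
# scikit-learn (loss='log_loss' and max_iter apply to scikit-learn >= 1.1); the
# toy data and fold count are assumptions.
import numpy as np
from scipy.special import logit
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import StratifiedKFold

rng = np.random.RandomState(0)
X = rng.rand(200, 5)
y = (X[:, 0] > 0.5).astype(int)
X_test = rng.rand(50, 5)

oof = np.zeros(len(y))   # out-of-fold train predictions (logit scale)
test_folds = []          # per-fold test predictions, averaged afterwards
for tr_ix, va_ix in StratifiedKFold(n_splits=5, shuffle=True, random_state=0).split(X, y):
    model = SGDClassifier(loss='log_loss', penalty='elasticnet',
                          max_iter=100, l1_ratio=0.0001, alpha=1e-05)
    model.fit(X[tr_ix], y[tr_ix])
    oof[va_ix] = logit(model.predict_proba(X[va_ix])[:, 1])
    test_folds.append(logit(model.predict_proba(X_test)[:, 1]))
test_pred = np.mean(test_folds, axis=0)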
def download_patric_genomes(self, ids, force_rerun=False):
    """Download genome files from PATRIC given a list of PATRIC genome IDs and load them as strains.

    Args:
        ids (str, list): PATRIC ID or list of PATRIC IDs
        force_rerun (bool): If genome files should be downloaded again even if they exist

    """
    ids = ssbio.utils.force_list(ids)

    counter = 0
    log.info('Downloading sequences from PATRIC...')
    for patric_id in tqdm(ids):
        f = ssbio.databases.patric.download_coding_sequences(patric_id=patric_id, seqtype='protein',
                                                             outdir=self.sequences_by_organism_dir,
                                                             force_rerun=force_rerun)
        if f:
            self.load_strain(patric_id, f)
            counter += 1
            log.debug('{}: downloaded sequence'.format(patric_id))
        else:
            log.warning('{}: unable to download sequence'.format(patric_id))

    log.info('Created {} new strain GEM-PROs, accessible at "strains" attribute'.format(counter))
def build_strain_specific_models(self, save_models=False):
    """Using the orthologous genes matrix, create and modify the strain-specific models based on whether
    orthologous genes exist.

    Also store the sequences directly in the reference GEM-PRO protein sequence attribute for the strains.
    """
    if len(self.df_orthology_matrix) == 0:
        raise RuntimeError('Empty orthology matrix')

    # Build each strain model starting from an emptied copy of the reference GEM-PRO
    for strain_gempro in tqdm(self.strains):
        log.debug('{}: building strain specific model'.format(strain_gempro.id))

        # For each genome, load the metabolic model or genes from the reference GEM-PRO
        logging.disable(logging.WARNING)
        if self._empty_reference_gempro.model:
            strain_gempro.load_cobra_model(self._empty_reference_gempro.model)
        elif self._empty_reference_gempro.genes:
            strain_gempro.genes = [x.id for x in self._empty_reference_gempro.genes]
        logging.disable(logging.NOTSET)

        # Get a list of genes which do not have orthology in the strain
        not_in_strain = self.df_orthology_matrix[pd.isnull(self.df_orthology_matrix[strain_gempro.id])][strain_gempro.id].index.tolist()

        # Mark genes non-functional
        self._pare_down_model(strain_gempro=strain_gempro, genes_to_remove=not_in_strain)

        # Load sequences into the base and strain models
        self._load_strain_sequences(strain_gempro=strain_gempro)

        if save_models:
            cobra.io.save_json_model(model=strain_gempro.model,
                                     filename=op.join(self.model_dir, '{}.json'.format(strain_gempro.id)))
            strain_gempro.save_pickle(op.join(self.model_dir, '{}_gp.pckl'.format(strain_gempro.id)))

    log.info('Created {} new strain-specific models and loaded in sequences'.format(len(self.strains)))
def align_orthologous_genes_pairwise(self, gapopen=10, gapextend=0.5):
    """For each gene in the base strain, run a pairwise alignment for all orthologous gene sequences to it."""
    for ref_gene in tqdm(self.reference_gempro.genes):
        if len(ref_gene.protein.sequences) > 1:
            alignment_dir = op.join(self.sequences_by_gene_dir, ref_gene.id)
            if not op.exists(alignment_dir):
                os.mkdir(alignment_dir)
            ref_gene.protein.pairwise_align_sequences_to_representative(gapopen=gapopen, gapextend=gapextend,
                                                                        outdir=alignment_dir, parse=True)
def set_representative_sequence(self, force_rerun=False):
    """Automatically consolidate loaded sequences (manual, UniProt, or KEGG) and set a single representative sequence.

    Manually set representative sequences override all existing mappings. UniProt mappings override KEGG mappings
    except when KEGG mappings have PDBs associated with them and UniProt doesn't.

    Args:
        force_rerun (bool): Set to True to recheck stored sequences

    """
    # TODO: rethink use of multiple database sources - may lead to inconsistency with genome sources
    sequence_missing = []
    successfully_mapped_counter = 0
    for g in tqdm(self.genes):
        repseq = g.protein.set_representative_sequence(force_rerun=force_rerun)
        if not repseq:
            sequence_missing.append(g.id)
        elif not repseq.sequence_file:
            sequence_missing.append(g.id)
        else:
            successfully_mapped_counter += 1

    log.info('{}/{}: number of genes with a representative sequence'.format(len(self.genes_with_a_representative_sequence),
                                                                            len(self.genes)))
    log.info('See the "df_representative_sequences" attribute for a summary dataframe.')
def get_sequence_properties(self, representatives_only=True):
    """Run Biopython ProteinAnalysis and EMBOSS pepstats to summarize basic statistics of all protein sequences.

    Results are stored in the protein's respective SeqProp objects at ``.annotations``

    Args:
        representatives_only (bool): If analysis should only be run on the representative sequences

    """
    for g in tqdm(self.genes):
        g.protein.get_sequence_properties(representative_only=representatives_only)
def blast_seqs_to_pdb(self, seq_ident_cutoff=0, evalue=0.0001, all_genes=False, display_link=False,
                      outdir=None, force_rerun=False):
    """BLAST each representative protein sequence to the PDB. Saves raw BLAST results (XML files).

    Args:
        seq_ident_cutoff (float, optional): Cutoff results based on percent coverage (in decimal form)
        evalue (float, optional): Cutoff for the E-value - filters for significant hits. 0.001 is liberal,
            0.0001 is stringent (default).
        all_genes (bool): If all genes should be BLASTed, or only those without any structures currently mapped
        display_link (bool, optional): Set to True if links to the HTML results should be displayed
        outdir (str): Path to output directory of downloaded files, must be set if GEM-PRO directories
            were not created initially
        force_rerun (bool, optional): If existing BLAST results should not be used, set to True. Default is False.

    """
    counter = 0
    for g in tqdm(self.genes_with_a_representative_sequence):
        # If all_genes=False, BLAST only genes without a UniProt -> PDB mapping
        if g.protein.num_structures_experimental > 0 and not all_genes and not force_rerun:
            log.debug('{}: skipping BLAST, {} experimental structures already mapped '
                      'and all_genes flag is False'.format(g.id, g.protein.num_structures_experimental))
            continue

        # BLAST the sequence to the PDB
        new_pdbs = g.protein.blast_representative_sequence_to_pdb(seq_ident_cutoff=seq_ident_cutoff,
                                                                  evalue=evalue,
                                                                  display_link=display_link,
                                                                  outdir=outdir,
                                                                  force_rerun=force_rerun)
        if new_pdbs:
            counter += 1
            log.debug('{}: {} PDBs BLASTed'.format(g.id, len(new_pdbs)))
        else:
            log.debug('{}: no BLAST results'.format(g.id))

    log.info('Completed sequence --> PDB BLAST. See the "df_pdb_blast" attribute for a summary dataframe.')
    log.info('{}: number of genes with additional structures added from BLAST'.format(counter))
def get_dssp_annotations(self, representatives_only=True, force_rerun=False):
    """Run DSSP on structures and store calculations.

    Annotations are stored in the protein structure's chain sequence at:
    ``<chain_prop>.seq_record.letter_annotations['*-dssp']``

    Args:
        representatives_only (bool): If analysis should only be run on the representative structure
        force_rerun (bool): If calculations should be rerun even if an output file exists

    """
    for g in tqdm(self.genes):
        g.protein.get_dssp_annotations(representative_only=representatives_only, force_rerun=force_rerun)
def get_msms_annotations(self, representatives_only=True, force_rerun=False):
    """Run MSMS on structures and store calculations.

    Annotations are stored in the protein structure's chain sequence at:
    ``<chain_prop>.seq_record.letter_annotations['*-msms']``

    Args:
        representatives_only (bool): If analysis should only be run on the representative structure
        force_rerun (bool): If calculations should be rerun even if an output file exists

    """
    for g in tqdm(self.genes):
        g.protein.get_msms_annotations(representative_only=representatives_only, force_rerun=force_rerun)
def find_disulfide_bridges(self, representatives_only=True):
    """Run Biopython's disulfide bridge finder and store found bridges.

    Annotations are stored in the protein structure's chain sequence at:
    ``<chain_prop>.seq_record.annotations['SSBOND-biopython']``

    Args:
        representatives_only (bool): If analysis should only be run on the representative structure

    """
    for g in tqdm(self.genes):
        g.protein.find_disulfide_bridges(representative_only=representatives_only)
### END STRUCTURE RELATED METHODS ###
####################################################################################################################
def download_progress(res, stream, save_name):
    total_size = int(res.headers.get('content-length', 0))
    chunk_size = 32 * 1024
    pbar = tqdm(total=total_size, unit='B', unit_scale=True, desc=save_name)
    for data in res.iter_content(chunk_size):
        stream.write(data)
        pbar.update(len(data))  # count the bytes actually written; the last chunk may be short
    pbar.close()
def upload_data(gpu_ip, job_hash, data_path):
    url = 'http://%s:%s/runJobDecorator' % (gpu_ip, settings.GPU_PORT)
    file_size = path.getsize(data_path)
    pbar = tqdm(total=file_size, unit='B', unit_scale=True)

    def callback(monitor):
        progress = monitor.bytes_read - callback.last_bytes_read
        pbar.update(progress)
        callback.last_bytes_read = monitor.bytes_read
    callback.last_bytes_read = 0

    with open(data_path, 'rb') as f:
        data = {
            'file': ('uploads.pkl', f, 'application/octet-stream'),
            'hash': job_hash
        }
        encoder = MultipartEncoder(fields=data)
        monitor = MultipartEncoderMonitor(encoder, callback)
        r = requests.post(url, data=monitor,
                          headers={'Content-Type': monitor.content_type})

    remove(data_path)
    # pbar might not close when the user interrupts, need to fix this
    pbar.close()
    status_check(r)
def download_and_unzip_result(url, job_hash):
    r = requests.get(url, stream=True)
    status_check(r)

    total_size = int(r.headers.get('content-length', 0))
    with open('download.zip', 'wb') as f:
        pbar = tqdm(total=total_size, unit='B', unit_scale=True)
        chunk_size = 1024 * 32  # 32 KB
        for data in r.iter_content(chunk_size):
            f.write(data)
            pbar.update(len(data))  # count the bytes actually written
    # again there might be a pbar issue here
    pbar.close()

    zip_content = open("download.zip", "rb").read()
    z = ZipFile(io.BytesIO(zip_content))
    z.extractall()
    remove('download.zip')

    result = None     # output of the script
    new_files = None  # names of new files created by the script

    pickle_path = path.abspath(path.join(job_hash, job_hash + '.pkl'))
    if path.isfile(pickle_path):
        with open(pickle_path, 'rb') as f:
            # Hack: a workaround for dill's pickling problem
            # import_all()
            result = dill.load(f)
            # unimport_all()
        remove(pickle_path)

    if path.isdir(job_hash):
        new_files = listdir(job_hash)
        for name in new_files:
            rename(path.join(job_hash, name), name)
        rmtree(job_hash)

    return result, new_files
async def run(self):
    self.stream = self.sink.input_streams[0]
    axes = self.stream.descriptor.axes
    num_axes = len(axes)
    totals = [self.stream.descriptor.num_points_through_axis(axis) for axis in range(num_axes)]
    chunk_sizes = [max(1, self.stream.descriptor.num_points_through_axis(axis + 1)) for axis in range(num_axes)]
    self.num = min(self.num, num_axes)

    self.bars = []
    for i in range(self.num):
        if self.notebook:
            self.bars.append(tqdm_notebook(total=totals[i] / chunk_sizes[i]))
        else:
            self.bars.append(tqdm(total=totals[i] / chunk_sizes[i]))

    self.w_id = 0
    while True:
        if self.stream.done() and self.w_id == self.stream.num_points():
            break

        new_data = np.array(await self.stream.queue.get()).flatten()
        while self.stream.queue.qsize() > 0:
            new_data = np.append(new_data, np.array(self.stream.queue.get_nowait()).flatten())
        self.w_id += new_data.size

        num_data = self.stream.points_taken
        for i in range(self.num):
            if num_data == 0:
                # Reset the progress bar with a new one
                if self.notebook:
                    self.bars[i].sp(close=True)
                    self.bars[i] = tqdm_notebook(total=totals[i] / chunk_sizes[i])
                else:
                    self.bars[i].close()
                    self.bars[i] = tqdm(total=totals[i] / chunk_sizes[i])
            pos = int(10 * num_data / chunk_sizes[i]) / 10.0  # One decimal is good enough
            if pos > self.bars[i].n:
                self.bars[i].update(pos - self.bars[i].n)
            num_data = num_data % chunk_sizes[i]
def _pbar(iterable, desc, leave=True, position=None, verbose='progressbar'):
    if verbose == 'progressbar':
        from mne.utils import ProgressBar
        _ProgressBar = ProgressBar
        if not version_is_greater_equal('mne', '0.14'):
            class _ProgressBar(ProgressBar):
                def __iter__(self):
                    """Iterate to auto-increment the pbar with 1."""
                    self.max_value = len(iterable)
                    for obj in iterable:
                        yield obj
                        self.update_with_increment_value(1)

        pbar = _ProgressBar(iterable, mesg=desc, spinner=True)
    elif verbose == 'tqdm':
        from tqdm import tqdm
        pbar = tqdm(iterable, desc=desc, leave=leave, position=position,
                    dynamic_ncols=True)
    elif verbose == 'tqdm_notebook':
        from tqdm import tqdm_notebook
        pbar = tqdm_notebook(iterable, desc=desc, leave=leave,
                             position=position, dynamic_ncols=True)
    elif verbose is False:
        pbar = iterable
    return pbar
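# --- Usage sketch (added; not part of the original source) ---
# How the dispatcher above might be called; the iterable and description strings
# are placeholders, and _pbar is assumed to be in scope.
items = range(100)

for item in _pbar(items, desc='processing', verbose='tqdm'):
    pass  # work on each item with a terminal tqdm bar

for item in _pbar(items, desc='processing', verbose=False):
    pass  # same loop with no progress reporting at all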
def augment(X, Y, n=5, elastic_alpha_range=20, rotation_range=30,
            shift_x_range=0.1, shift_y_range=0.1):
    """I admit this code is very ugly (much worse than the Keras ImageDataGenerator API).
    Note to myself: don't reinvent the wheel...
    """
    X_transformed = np.zeros([X.shape[0] * n * 3] + list(X.shape[1:]))
    Y_transformed = np.zeros([X.shape[0] * n * 3] + list(X.shape[1:]))
    z = 0
    for i in tnrange(n):
        for j, (XX, YY) in tqdm_notebook(enumerate(zip(X, Y)), total=len(X), disable=False, leave=False):
            X_transformed[z], Y_transformed[z] = elastic_transform([XX, YY], elastic_alpha_range, sigma=10)
            z += 1
        for j, (XX, YY) in tqdm_notebook(enumerate(zip(X, Y)), total=len(X), disable=False, leave=False):
            X_transformed[z], Y_transformed[z] = random_rotation([XX, YY], rg=rotation_range,
                                                                 row_index=1, col_index=2,
                                                                 channel_index=0,
                                                                 fill_mode='nearest', cval=0.)
            z += 1
        for j, (XX, YY) in tqdm_notebook(enumerate(zip(X, Y)), total=len(X), disable=False, leave=False):
            X_transformed[z], Y_transformed[z] = random_shift([XX, YY], shift_x_range, shift_y_range,
                                                              row_index=1, col_index=2,
                                                              channel_index=0,
                                                              fill_mode='nearest', cval=0.)
            z += 1
    return X_transformed, Y_transformed
def verboserate(iterable, *args, **kwargs):
    """Iterate verbosely.

    Args:
        desc (str): prefix for the progress bar
        total (int): total length of the iterable

    See more options for tqdm.tqdm.
    """
    progress = tqdm_notebook if in_ipython() else tqdm
    for val in progress(iterable, *args, **kwargs):
        yield val
def on_epoch_begin(self, net, X=None, X_valid=None, **kwargs):
    # Assume it is a number until proven otherwise.
    batches_per_epoch = self.batches_per_epoch
    if self.batches_per_epoch == 'auto':
        batches_per_epoch = self._get_batches_per_epoch(net, X, X_valid)
    elif self.batches_per_epoch == 'count':
        # No limit is known until the end of the first epoch.
        batches_per_epoch = None

    if self._use_notebook():
        self.pbar = tqdm.tqdm_notebook(total=batches_per_epoch)
    else:
        self.pbar = tqdm.tqdm(total=batches_per_epoch)
def kegg_mapping_and_metadata(self, kegg_organism_code, custom_gene_mapping=None, outdir=None,
                              set_as_representative=False, force_rerun=False):
    """Map all genes in the model to KEGG IDs using the KEGG service.

    Steps:
        1. Download all metadata and sequence files in the sequences directory
        2. Creates a KEGGProp object in the protein.sequences attribute
        3. Returns a Pandas DataFrame of mapping results

    Args:
        kegg_organism_code (str): The three-letter KEGG code of your organism
        custom_gene_mapping (dict): If your model genes differ from the gene IDs you want to map,
            custom_gene_mapping allows you to input a dictionary which maps model gene IDs to new ones.
            Dictionary keys must match model gene IDs.
        outdir (str): Path to output directory of downloaded files, must be set if GEM-PRO directories
            were not created initially
        set_as_representative (bool): If mapped KEGG IDs should be set as representative sequences
        force_rerun (bool): If you want to overwrite any existing mappings and files

    """
    # First map all of the organism's KEGG genes to UniProt
    kegg_to_uniprot = ssbio.databases.kegg.map_kegg_all_genes(organism_code=kegg_organism_code, target_db='uniprot')

    successfully_mapped_counter = 0
    for g in tqdm(self.genes):
        if custom_gene_mapping:
            kegg_g = custom_gene_mapping[g.id]
        else:
            kegg_g = g.id

        # Download both FASTA and KEGG metadata files
        kegg_prop = g.protein.load_kegg(kegg_id=kegg_g, kegg_organism_code=kegg_organism_code,
                                        download=True, outdir=outdir, set_as_representative=set_as_representative,
                                        force_rerun=force_rerun)

        # Update a potentially old UniProt ID
        if kegg_g in kegg_to_uniprot.keys():
            kegg_prop.uniprot = kegg_to_uniprot[kegg_g]
            if g.protein.representative_sequence:
                if g.protein.representative_sequence.kegg == kegg_prop.kegg:
                    g.protein.representative_sequence.uniprot = kegg_to_uniprot[kegg_g]

        # Keep track of missing mappings - missing is defined by no available sequence
        if kegg_prop.sequence_file:
            successfully_mapped_counter += 1

        log.debug('{}: loaded KEGG information for gene'.format(g.id))

    log.info('{}/{}: number of genes mapped to KEGG'.format(successfully_mapped_counter, len(self.genes)))
    log.info('Completed ID mapping --> KEGG. See the "df_kegg_metadata" attribute for a summary dataframe.')
def uniprot_mapping_and_metadata(self, model_gene_source, custom_gene_mapping=None, outdir=None,
                                 set_as_representative=False, force_rerun=False):
    """Map all genes in the model to UniProt IDs using the UniProt mapping service.
    Also download all metadata and sequences.

    Args:
        model_gene_source (str): The database source of your model gene IDs.
            See: http://www.uniprot.org/help/api_idmapping
            Common model gene sources are:

            * Ensembl Genomes - ``ENSEMBLGENOME_ID`` (i.e. E. coli b-numbers)
            * Entrez Gene (GeneID) - ``P_ENTREZGENEID``
            * RefSeq Protein - ``P_REFSEQ_AC``

        custom_gene_mapping (dict): If your model genes differ from the gene IDs you want to map,
            custom_gene_mapping allows you to input a dictionary which maps model gene IDs to new ones.
            Dictionary keys must match model genes.
        outdir (str): Path to output directory of downloaded files, must be set if GEM-PRO directories
            were not created initially
        set_as_representative (bool): If mapped UniProt IDs should be set as representative sequences
        force_rerun (bool): If you want to overwrite any existing mappings and files

    """
    # Allow model gene --> custom ID mapping ({'TM_1012': 'TM1012'})
    if custom_gene_mapping:
        genes_to_map = list(custom_gene_mapping.values())
    else:
        genes_to_map = [x.id for x in self.genes]

    # Map all IDs first to available UniProt entries
    genes_to_uniprots = bs_unip.mapping(fr=model_gene_source, to='ACC', query=genes_to_map)

    successfully_mapped_counter = 0
    for g in tqdm(self.genes):
        if custom_gene_mapping and g.id in custom_gene_mapping.keys():
            uniprot_gene = custom_gene_mapping[g.id]
        else:
            uniprot_gene = g.id

        if uniprot_gene not in list(genes_to_uniprots.keys()):
            log.debug('{}: unable to map to UniProt'.format(g.id))
        else:
            for mapped_uniprot in genes_to_uniprots[uniprot_gene]:
                try:
                    uniprot_prop = g.protein.load_uniprot(uniprot_id=mapped_uniprot, download=True, outdir=outdir,
                                                          set_as_representative=set_as_representative,
                                                          force_rerun=force_rerun)
                except HTTPError as e:
                    log.error('{}, {}: unable to complete web request'.format(g.id, mapped_uniprot))
                    print(e)
                    continue

            if uniprot_prop.sequence_file or uniprot_prop.metadata_file:
                successfully_mapped_counter += 1

    log.info('{}/{}: number of genes mapped to UniProt'.format(successfully_mapped_counter, len(self.genes)))
    log.info('Completed ID mapping --> UniProt. See the "df_uniprot_metadata" attribute for a summary dataframe.')
def get_scratch_predictions(self, path_to_scratch, results_dir, scratch_basename='scratch', num_cores=1,
                            exposed_buried_cutoff=25, custom_gene_mapping=None):
    """Run and parse ``SCRATCH`` results to predict secondary structure and solvent accessibility.

    Annotations are stored in the protein's representative sequence at:

        * ``.annotations``
        * ``.letter_annotations``

    Args:
        path_to_scratch (str): Path to the SCRATCH executable
        results_dir (str): Path to the SCRATCH results folder, which will have the files (scratch.ss, scratch.ss8,
            scratch.acc, scratch.acc20)
        scratch_basename (str): Basename of the SCRATCH results ('scratch' is the default)
        num_cores (int): Number of cores to use to parallelize the SCRATCH run
        exposed_buried_cutoff (int): Cutoff of exposed/buried for the acc20 predictions
        custom_gene_mapping (dict): Default parsing of SCRATCH output files is to look for the model gene IDs. If
            your output files contain IDs which differ from the model gene IDs, use this dictionary to map model
            gene IDs to result file IDs. Dictionary keys must match model genes.

    """
    if not self.genome_path:
        # Write all sequences as one file
        all_seqs = self.write_representative_sequences_file(outname=self.id)

    # Runs SCRATCH or loads existing results in results_dir
    scratch = SCRATCH(project_name=scratch_basename, seq_file=self.genome_path)
    scratch.run_scratch(path_to_scratch=path_to_scratch, num_cores=num_cores, outdir=results_dir)

    counter = 0
    # Add the SCRATCH annotations to the representative_sequences letter_annotations
    for g in tqdm(self.genes_with_a_representative_sequence):
        if custom_gene_mapping:
            g_id = custom_gene_mapping[g.id]
        else:
            g_id = g.id

        if g_id in scratch.sspro_summary():
            # Secondary structure
            g.protein.representative_sequence.annotations.update(scratch.sspro_summary()[g_id])
            g.protein.representative_sequence.annotations.update(scratch.sspro8_summary()[g_id])
            g.protein.representative_sequence.letter_annotations['SS-sspro'] = scratch.sspro_results()[g_id]
            g.protein.representative_sequence.letter_annotations['SS-sspro8'] = scratch.sspro8_results()[g_id]

            # Solvent accessibility
            g.protein.representative_sequence.annotations.update(scratch.accpro_summary()[g_id])
            g.protein.representative_sequence.annotations.update(scratch.accpro20_summary(exposed_buried_cutoff)[g_id])
            g.protein.representative_sequence.letter_annotations['RSA-accpro'] = scratch.accpro_results()[g_id]
            g.protein.representative_sequence.letter_annotations['RSA-accpro20'] = scratch.accpro20_results()[g_id]

            counter += 1
        else:
            log.error('{}: missing SCRATCH results'.format(g.id))

    log.info('{}/{}: number of genes with SCRATCH predictions loaded'.format(counter, len(self.genes)))
def get_tmhmm_predictions(self, tmhmm_results, custom_gene_mapping=None):
    """Parse TMHMM results and store them in the representative sequences.

    This is a basic function to parse pre-run TMHMM results. Run TMHMM from the
    web service (http://www.cbs.dtu.dk/services/TMHMM/) by doing the following:

        1. Write all representative sequences in the GEM-PRO using the function ``write_representative_sequences_file``
        2. Upload the file to http://www.cbs.dtu.dk/services/TMHMM/ and choose "Extensive, no graphics" as the output
        3. Copy and paste the results (ignoring the top header and everything above "HELP with output formats") into a file and save it
        4. Run this function on that file

    Args:
        tmhmm_results (str): Path to TMHMM results (long format)
        custom_gene_mapping (dict): Default parsing of TMHMM output is to look for the model gene IDs. If
            your output file contains IDs which differ from the model gene IDs, use this dictionary to map model
            gene IDs to result file IDs. Dictionary keys must match model genes.

    """
    # TODO: refactor to Protein class?
    tmhmm_dict = ssbio.protein.sequence.properties.tmhmm.parse_tmhmm_long(tmhmm_results)

    counter = 0
    for g in tqdm(self.genes_with_a_representative_sequence):
        if custom_gene_mapping:
            g_id = custom_gene_mapping[g.id]
        else:
            g_id = g.id

        if g_id in tmhmm_dict:
            log.debug('{}: loading TMHMM results'.format(g.id))
            if not tmhmm_dict[g_id]:
                log.error("{}: missing TMHMM results".format(g.id))
                continue
            g.protein.representative_sequence.annotations['num_tm_helix-tmhmm'] = tmhmm_dict[g_id]['num_tm_helices']
            g.protein.representative_sequence.letter_annotations['TM-tmhmm'] = tmhmm_dict[g_id]['sequence']
            counter += 1
        else:
            log.error("{}: missing TMHMM results".format(g.id))

    log.info('{}/{}: number of genes with TMHMM predictions loaded'.format(counter, len(self.genes)))
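# --- Workflow sketch (added; not part of the original source) ---
# A hedged sketch of the four-step workflow described in the docstring above. The
# my_gempro object and the results file path are placeholders; the method names
# come from the docstrings in this collection.
# 1. Write all representative sequences to a single FASTA file.
seq_file = my_gempro.write_representative_sequences_file(outname=my_gempro.id)

# 2./3. Submit seq_file to http://www.cbs.dtu.dk/services/TMHMM/ ("Extensive, no graphics"),
#       then save the long-format results to a local text file.
tmhmm_results_file = '/path/to/tmhmm_long_output.txt'  # placeholder path

# 4. Parse and store the predictions in the representative sequences.
my_gempro.get_tmhmm_predictions(tmhmm_results=tmhmm_results_file)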
### END SEQUENCE RELATED METHODS ###
####################################################################################################################
####################################################################################################################
### STRUCTURE RELATED METHODS ###
def get_itasser_models(self, homology_raw_dir, custom_itasser_name_mapping=None, outdir=None, force_rerun=False):
    """Copy generated I-TASSER models from a directory to the GEM-PRO directory.

    Args:
        homology_raw_dir (str): Root directory of I-TASSER folders.
        custom_itasser_name_mapping (dict): Use this if your I-TASSER folder names differ from your model gene names.
            Input a dict of {model_gene: ITASSER_folder}.
        outdir (str): Path to output directory of downloaded files, must be set if GEM-PRO directories
            were not created initially
        force_rerun (bool): If homology files should be copied again even if they exist in the GEM-PRO directory

    """
    counter = 0
    for g in tqdm(self.genes):
        if custom_itasser_name_mapping and g.id in custom_itasser_name_mapping:
            hom_id = custom_itasser_name_mapping[g.id]
            if not op.exists(op.join(homology_raw_dir, hom_id)):
                hom_id = g.id
        else:
            hom_id = g.id

        # The name of the actual pdb file will be $GENEID_model1.pdb
        new_itasser_name = hom_id + '_model1'
        orig_itasser_dir = op.join(homology_raw_dir, hom_id)

        try:
            itasser_prop = g.protein.load_itasser_folder(ident=hom_id, itasser_folder=orig_itasser_dir,
                                                         organize=True, outdir=outdir,
                                                         organize_name=new_itasser_name,
                                                         force_rerun=force_rerun)
        except OSError:
            log.debug('{}: homology model folder unavailable'.format(g.id))
            continue
        except IOError:
            log.debug('{}: homology model unavailable'.format(g.id))
            continue

        if itasser_prop.structure_file:
            counter += 1
        else:
            log.debug('{}: homology model file unavailable, perhaps modelling did not finish'.format(g.id))

    log.info('Completed copying of {} I-TASSER models to GEM-PRO directory. '
             'See the "df_homology_models" attribute for a summary dataframe.'.format(counter))