def pad_nucleotide_sequences(aln_aa, seq_nuc):
'''
introduce gaps of 3 (---) into nucleotide sequences corresponding to aligned DNA sequences.
Parameters:
- aln_aa: amino acid alignment
- seq_nuc: unaligned nucleotide sequences.
Returns:
- aligned nucleotide sequences with all gaps length 3
'''
from Bio.Align import MultipleSeqAlignment
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
aln_nuc = MultipleSeqAlignment([])
for aa_seq in aln_aa:
try:
tmp_nuc_seq = str(seq_nuc[aa_seq.id].seq)
except KeyError as e:
print aa_seq.id
print 'Key not found, continue with next sequence'
continue
tmpseq = ''
nuc_pos = 0
for aa in aa_seq:
if aa=='-':
tmpseq+='---'
else:
tmpseq+=tmp_nuc_seq[nuc_pos:(nuc_pos+3)]
nuc_pos+=3
aln_nuc.append(SeqRecord(seq=Seq(tmpseq),id=aa_seq.id))
return aln_nuc
python类Seq()的实例源码
def add_translations(self):
'''
translate the nucleotide sequence into the proteins specified
in self.proteins. these are expected to be SeqFeatures
'''
from Bio import Seq
# Sort proteins by start position of the corresponding SeqFeature entry.
sorted_proteins = sorted(self.proteins.items(), key=lambda protein_pair: protein_pair[1].start)
for node in self.tree.find_clades(order='preorder'):
if not hasattr(node, "translations"):
# Maintain genomic order of protein translations for easy
# assembly by downstream functions.
node.translations=OrderedDict()
node.aa_mutations = {}
for prot, feature in sorted_proteins:
node.translations[prot] = Seq.translate(str(feature.extract(Seq.Seq("".join(node.sequence)))).replace('-', 'N'))
if node.up is None:
node.aa_mutations[prot] = []
else:
node.aa_mutations[prot] = [(a,pos,d) for pos, (a,d) in
enumerate(zip(node.up.translations[prot],
node.translations[prot])) if a!=d]
self.dump_attr.append('translations')
def __init__(self, logger, sequences, reference, dateFormat):
super(sequence_set, self).__init__()
self.log = logger
# load sequences from the (parsed) JSON - don't forget to sort out dates
self.seqs = {}
for name, data in sequences.iteritems():
self.seqs[name] = SeqRecord(Seq(data["seq"], generic_dna),
id=name, name=name, description=name)
self.seqs[name].attributes = data["attributes"]
# tidy up dates
date_struc = parse_date(self.seqs[name].attributes["raw_date"], dateFormat)
self.seqs[name].attributes["num_date"] = date_struc[1]
self.seqs[name].attributes["date"] = date_struc[2]
# if the reference is to be analysed it'll already be in the (filtered & subsampled)
# sequences, so no need to add it here, and no need to care about attributes etc
# we do, however, need it for alignment
self.reference_in_dataset = reference["included"]
name = reference["strain"]
self.reference_seq = SeqRecord(Seq(reference["seq"], generic_dna),
id=name, name=name, description=name)
if "genes" in reference and len(reference["genes"]):
self.proteins = {k:FeatureLocation(start=v["start"], end=v["end"], strand=v["strand"]) for k, v in reference["genes"].iteritems()}
else:
self.proteins = None
# other things:
self.run_dir = '_'.join(['temp', time.strftime('%Y%m%d-%H%M%S',time.gmtime()), str(random.randint(0,1000000))])
self.nthreads = 2 # should load from config file
def strip_non_reference(self):
'''
remove insertions relative to the reference from the alignment
'''
ungapped = np.array(self.reference_aln)!='-'
for seq in self.aln:
seq.seq = Seq("".join(np.array(seq)[ungapped]))
def make_gaps_ambiguous(self):
'''
replace all gaps by 'N' in all sequences in the alignment. TreeTime will treat them
as fully ambiguous and replace then with the most likely state
'''
for seq in self.aln:
seq_array = np.array(seq)
gaps = seq_array=='-'
seq_array[gaps]='N'
seq.seq = Seq("".join(seq_array))
def translate(self):
'''
make alignments of translations.
'''
from Bio.Seq import CodonTable
codon_table = CodonTable.ambiguous_dna_by_name['Standard'].forward_table
self.translations={}
if not hasattr(self, "proteins"): # ensure dictionary to hold annotation
self.proteins={}
# add a default translation of the entire sequence unless otherwise specified
if len(self.proteins)==0:
self.proteins.update({'cds':FeatureLocation(start=0,
end=self.aln.get_alignment_length(), strand=1)})
# loop over all proteins and create one MSA for each
for prot in self.proteins:
aa_seqs = []
for seq in self.aln:
tmpseq = self.proteins[prot].extract(seq)
translated_seq, translation_exception = safe_translate(str(tmpseq.seq), report_exceptions=True)
if translation_exception:
self.log.notify("Trouble translating because of invalid codons %s" % seq.id)
tmpseq.seq = Seq(translated_seq)
# copy attributes
tmpseq.attributes = seq.attributes
aa_seqs.append(tmpseq)
self.translations[prot] = MultipleSeqAlignment(aa_seqs)
def get_aa_seq(chain):
'''
Extract amino acid sequence from a PDB chain object and return sequence as
Bio.SeqRecord object.
'''
aa_list = []
residue_numbers = []
for residue in chain:
if is_aa(residue):
aa_list.append(SCOPData.protein_letters_3to1[residue.resname])
residue_numbers.append(str(residue.get_id()[1]) + \
residue.get_id()[2].strip())
aa_seq = SeqRecord(Seq(''.join(aa_list)), id='pdb_seq', description='')
return aa_seq, residue_numbers
def add_fixed_errors(input_iter, nr_errors, error_type):
"""Simulate sequencing errors for each SeqRecord object in the input iterator.
:param input_iter: Iterator of SeqRecord objects.
:para nr_errors: Number of errors to introduce.
:param error_type: Error type: substitution, insertion or deletion.
:returns: Generator of SeqRecord objects.
:rtype: generator
"""
for record in input_iter:
mutated_seq = sim_seq.add_errors(record.seq, nr_errors, error_type)
record.seq = Seq(mutated_seq)
yield record
def simulate_errors(input_iter, error_rate, error_weights):
"""Simulate sequencing errors for each SeqRecord object in the input iterator.
:param input_iter: Iterator of SeqRecord objects.
:para error_rate: Total error rate of substitutions, insertions and deletions.
:param error_weights: Relative frequency of substitutions,insertions,deletions.
:returns: Generator of SeqRecord objects.
:rtype: generator
"""
for record in input_iter:
mutated_seq = sim_seq.simulate_sequencing_errors(record.seq, error_rate, error_weights).seq
record.seq = Seq(mutated_seq)
yield record
def build_record(id, classification):
description = (classification['classifier']
+ '|' + str(classification['taxid'])
+ '|' + classification['sciname']
+ '|' + classification['rank']
+ '|' + ';'.join(classification['lineage']))
record = SeqRecord(Seq(classification['sequence'], IUPAC.ambiguous_dna),
id=id, description=description)
return record
def digest(fasta_records, enzyme):
"""
Divide a genome into restriction fragments.
Parameters
----------
fasta_records : OrderedDict
Dictionary of chromosome names to sequence records.
enzyme: str
Name of restriction enzyme.
Returns
-------
Dataframe with columns: 'chrom', 'start', 'end'.
"""
import Bio.Restriction as biorst
import Bio.Seq as bioseq
# http://biopython.org/DIST/docs/cookbook/Restriction.html#mozTocId447698
chroms = fasta_records.keys()
try:
cut_finder = getattr(biorst, enzyme).search
except AttributeError:
raise ValueError('Unknown enzyme name: {}'.format(enzyme))
def _each(chrom):
seq = bioseq.Seq(str(fasta_records[chrom]))
cuts = np.r_[0, np.array(cut_finder(seq)) + 1, len(seq)].astype(int)
n_frags = len(cuts) - 1
frags = pd.DataFrame({
'chrom': [chrom] * n_frags,
'start': cuts[:-1],
'end': cuts[1:]},
columns=['chrom', 'start', 'end'])
return frags
return pd.concat(map(_each, chroms), axis=0, ignore_index=True)
def sequence(self):
"""
Returns the nucleotide sequence of self
"""
if self.location.strand > 0:
return SeqFeature(location = self.location).extract(self.seq)
seq = Seq("")
for part in reversed(self.location.parts):
seq += SeqFeature(location = part).extract(self.seq)
return seq
def add_str(self, seq, name=None, description=""):
self.add_seq(SeqRecord(Seq(seq), id=name, description=description))
def reduce_exons(iterable_of_seqrecords):
"""Reduce the exons by sequence identity
Build a dict whose keys are the sequence in str format and the values are
lists in which the first element is the exon_id and the remaining are
the coordinates in loci:start-end format
"""
# Initialize
seq_to_data = dict()
# Go over each record
for record in iterable_of_seqrecords:
seq = str(record.seq)
if seq in seq_to_data: # Just append
seq_to_data[seq].append(record.description)
else: # Enter values in both dicts
seq_to_data[seq] = [
'EXON{:011d}'.format(len(seq_to_data) + 1),
record.description
]
# Collect data from both dicts into a SeqRecord and return
for seq in seq_to_data.keys():
identifier, *description = seq_to_data[seq]
record = SeqRecord(
id=identifier,
description=" ".join(description),
seq=Seq(seq)
)
yield record
def _segments_to_exon_dict(segments):
"""(list of lists) -> dict
Conver a list of ["S", "ident", "seq", *whatever] to a dict {ident: SeqRecord}
"""
exon_dict = {}
for segment in segments:
exon_id, sequence = segment[1:3]
exon_dict[exon_id] = SeqRecord(
id=exon_id,
description="",
seq=Seq(sequence)
)
return exon_dict
def _compose_paths(exon_dict, path_dict, number_of_ns):
ns = "N" * number_of_ns
for transcript_id, exon_list in sorted(path_dict.items()):
exon_seqs = [str(exon_dict[exon_id].seq) for exon_id in exon_list]
yield SeqRecord(
id=transcript_id,
description=",".join(exon_list),
seq=Seq(ns.join(exon_seqs))
)
def COMPL_STRING(line):
my_seq = Seq(line)
compl_line=str(my_seq.reverse_complement())
return compl_line
def gen_sequence(comp, length) :
seq_string = ''
##### Part 3 : Your random sequence generating code goes here
# Assertion is advised
assert abs( sum(comp.values())-1 ) < 0.01 , 'Probabilities do not add up to 1.'
# We generate a sequence of given length
for i in range(length) :
# Get a random number in [0,1)
dice = random()
limit=0
# We divide [0,1) interval according to probabilities of each nucleotide
for nuc in comp :
limit += comp[nuc]
# We add the letter that dice hits
if dice<limit :
seq_string += nuc
limit = 0
# Roll another dice for the next nucleotide
break
#####
sequence = Seq(seq_string)
return SeqRecord(sequence, id='Random Sequence', description=comp.__repr__())
##### ALL MODIFICATIONS GO ABOVE THIS LINE #####
### Part 0 : Argument Parsing
### We want out program to have easy-to-use parameters
### We are using the argparse library for this
def gen_sequence(comp, length) :
seq_string = ''
##### Part 3 : Your random sequence generating code goes here
##### Goal : Fill in seq_string with a random sequence of given composition
sequence = Seq(seq_string)
return SeqRecord(sequence, id='Random Sequence', description=comp.__repr__())
##### ALL MODIFICATIONS GO ABOVE THIS LINE #####
### Part 0 : Argument Parsing
### We want out program to have easy-to-use parameters
### We are using the argparse library for this
def cds_seq(self):
"""Lookup transcript, exon and UTR info from Ensembl Rest API."""
if not self._cds_seq:
seq_info = client.perform_rest_action(
'/sequence/id/{0}'.format(self.transcript_id),
params={'type': 'cds'})
seq_str = seq_info['seq']
if seq_str.startswith('N') or seq_str.endswith('N'):
seq_str = seq_str.strip('N')
seq = Seq(seq_str, IUPAC.unambiguous_dna)
self._cds_seq = seq
self._aa_seq = seq.translate()
else:
seq = self._cds_seq
return seq