python类Seq()的实例源码

seq_util.py 文件源码 项目:augur 作者: nextstrain 项目源码 文件源码 阅读 16 收藏 0 点赞 0 评论 0
def pad_nucleotide_sequences(aln_aa, seq_nuc):
    '''
    introduce gaps of 3 (---) into nucleotide sequences corresponding to aligned DNA sequences.

    Parameters:
    - aln_aa: amino acid alignment
    - seq_nuc: unaligned nucleotide sequences.

    Returns:
    - aligned nucleotide sequences with all gaps length 3
    '''
    from Bio.Align import MultipleSeqAlignment
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq
    aln_nuc = MultipleSeqAlignment([])
    for aa_seq  in aln_aa:
        try:
            tmp_nuc_seq = str(seq_nuc[aa_seq.id].seq)
        except KeyError as e:
            print aa_seq.id
            print 'Key not found, continue with next sequence'
            continue

        tmpseq = ''
        nuc_pos = 0
        for aa in aa_seq:
            if aa=='-':
                tmpseq+='---'
            else:
                tmpseq+=tmp_nuc_seq[nuc_pos:(nuc_pos+3)]
                nuc_pos+=3

        aln_nuc.append(SeqRecord(seq=Seq(tmpseq),id=aa_seq.id))

    return aln_nuc
tree.py 文件源码 项目:augur 作者: nextstrain 项目源码 文件源码 阅读 21 收藏 0 点赞 0 评论 0
def add_translations(self):
        '''
        translate the nucleotide sequence into the proteins specified
        in self.proteins. these are expected to be SeqFeatures
        '''
        from Bio import Seq

        # Sort proteins by start position of the corresponding SeqFeature entry.
        sorted_proteins = sorted(self.proteins.items(), key=lambda protein_pair: protein_pair[1].start)

        for node in self.tree.find_clades(order='preorder'):
            if not hasattr(node, "translations"):
                # Maintain genomic order of protein translations for easy
                # assembly by downstream functions.
                node.translations=OrderedDict()
                node.aa_mutations = {}

            for prot, feature in sorted_proteins:
                node.translations[prot] = Seq.translate(str(feature.extract(Seq.Seq("".join(node.sequence)))).replace('-', 'N'))

                if node.up is None:
                    node.aa_mutations[prot] = []
                else:
                    node.aa_mutations[prot] = [(a,pos,d) for pos, (a,d) in
                                               enumerate(zip(node.up.translations[prot],
                                                             node.translations[prot])) if a!=d]

        self.dump_attr.append('translations')
sequences_process.py 文件源码 项目:augur 作者: nextstrain 项目源码 文件源码 阅读 18 收藏 0 点赞 0 评论 0
def __init__(self, logger, sequences, reference, dateFormat):
        super(sequence_set, self).__init__()
        self.log = logger

        # load sequences from the (parsed) JSON - don't forget to sort out dates
        self.seqs = {}
        for name, data in sequences.iteritems():
            self.seqs[name] = SeqRecord(Seq(data["seq"], generic_dna),
                   id=name, name=name, description=name)
            self.seqs[name].attributes = data["attributes"]
            # tidy up dates
            date_struc = parse_date(self.seqs[name].attributes["raw_date"], dateFormat)
            self.seqs[name].attributes["num_date"] = date_struc[1]
            self.seqs[name].attributes["date"] = date_struc[2]

        # if the reference is to be analysed it'll already be in the (filtered & subsampled)
        # sequences, so no need to add it here, and no need to care about attributes etc
        # we do, however, need it for alignment
        self.reference_in_dataset = reference["included"]
        name = reference["strain"]
        self.reference_seq = SeqRecord(Seq(reference["seq"], generic_dna),
               id=name, name=name, description=name)
        if "genes" in reference and len(reference["genes"]):
            self.proteins = {k:FeatureLocation(start=v["start"], end=v["end"], strand=v["strand"]) for k, v in reference["genes"].iteritems()}
        else:
            self.proteins = None

        # other things:
        self.run_dir = '_'.join(['temp', time.strftime('%Y%m%d-%H%M%S',time.gmtime()), str(random.randint(0,1000000))])
        self.nthreads = 2 # should load from config file
sequences_process.py 文件源码 项目:augur 作者: nextstrain 项目源码 文件源码 阅读 19 收藏 0 点赞 0 评论 0
def strip_non_reference(self):
        '''
        remove insertions relative to the reference from the alignment
        '''
        ungapped = np.array(self.reference_aln)!='-'
        for seq in self.aln:
            seq.seq = Seq("".join(np.array(seq)[ungapped]))
sequences_process.py 文件源码 项目:augur 作者: nextstrain 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def make_gaps_ambiguous(self):
        '''
        replace all gaps by 'N' in all sequences in the alignment. TreeTime will treat them
        as fully ambiguous and replace then with the most likely state
        '''
        for seq in self.aln:
            seq_array = np.array(seq)
            gaps = seq_array=='-'
            seq_array[gaps]='N'
            seq.seq = Seq("".join(seq_array))
sequences_process.py 文件源码 项目:augur 作者: nextstrain 项目源码 文件源码 阅读 17 收藏 0 点赞 0 评论 0
def translate(self):
        '''
        make alignments of translations.
        '''
        from Bio.Seq import CodonTable
        codon_table  = CodonTable.ambiguous_dna_by_name['Standard'].forward_table
        self.translations={}
        if not hasattr(self, "proteins"): # ensure dictionary to hold annotation
            self.proteins={}

        # add a default translation of the entire sequence unless otherwise specified
        if len(self.proteins)==0:
            self.proteins.update({'cds':FeatureLocation(start=0,
                end=self.aln.get_alignment_length(), strand=1)})

        # loop over all proteins and create one MSA for each
        for prot in self.proteins:
            aa_seqs = []
            for seq in self.aln:
                tmpseq = self.proteins[prot].extract(seq)
                translated_seq, translation_exception = safe_translate(str(tmpseq.seq), report_exceptions=True)
                if translation_exception:
                    self.log.notify("Trouble translating because of invalid codons %s" % seq.id)

                tmpseq.seq = Seq(translated_seq)

                # copy attributes
                tmpseq.attributes = seq.attributes
                aa_seqs.append(tmpseq)
            self.translations[prot] = MultipleSeqAlignment(aa_seqs)
make_map.py 文件源码 项目:proteinER 作者: clauswilke 项目源码 文件源码 阅读 16 收藏 0 点赞 0 评论 0
def get_aa_seq(chain):
    '''
    Extract amino acid sequence from a PDB chain object and return sequence as
    Bio.SeqRecord object.
    '''
    aa_list = []
    residue_numbers = []
    for residue in chain:
        if is_aa(residue):
            aa_list.append(SCOPData.protein_letters_3to1[residue.resname])
            residue_numbers.append(str(residue.get_id()[1]) + \
                                   residue.get_id()[2].strip())
    aa_seq = SeqRecord(Seq(''.join(aa_list)), id='pdb_seq', description='')
    return aa_seq, residue_numbers
add_errors.py 文件源码 项目:wub 作者: nanoporetech 项目源码 文件源码 阅读 15 收藏 0 点赞 0 评论 0
def add_fixed_errors(input_iter, nr_errors, error_type):
    """Simulate sequencing errors for each SeqRecord object in the input iterator.

    :param input_iter: Iterator of SeqRecord objects.
    :para nr_errors: Number of errors to introduce.
    :param error_type: Error type: substitution, insertion or deletion.
    :returns: Generator of SeqRecord objects.
    :rtype: generator
    """
    for record in input_iter:
        mutated_seq = sim_seq.add_errors(record.seq, nr_errors, error_type)
        record.seq = Seq(mutated_seq)
        yield record
simulate_errors.py 文件源码 项目:wub 作者: nanoporetech 项目源码 文件源码 阅读 16 收藏 0 点赞 0 评论 0
def simulate_errors(input_iter, error_rate, error_weights):
    """Simulate sequencing errors for each SeqRecord object in the input iterator.

    :param input_iter: Iterator of SeqRecord objects.
    :para error_rate: Total error rate of substitutions, insertions and deletions.
    :param error_weights: Relative frequency of substitutions,insertions,deletions.
    :returns: Generator of SeqRecord objects.
    :rtype: generator
    """
    for record in input_iter:
        mutated_seq = sim_seq.simulate_sequencing_errors(record.seq, error_rate, error_weights).seq
        record.seq = Seq(mutated_seq)
        yield record
tictax.py 文件源码 项目:tictax 作者: bede 项目源码 文件源码 阅读 22 收藏 0 点赞 0 评论 0
def build_record(id, classification):
    description = (classification['classifier']
                   + '|' + str(classification['taxid'])
                   + '|' + classification['sciname']
                   + '|' + classification['rank']
                   + '|' + ';'.join(classification['lineage']))
    record = SeqRecord(Seq(classification['sequence'], IUPAC.ambiguous_dna),
                       id=id, description=description)
    return record
tools.py 文件源码 项目:bioframe 作者: mirnylab 项目源码 文件源码 阅读 22 收藏 0 点赞 0 评论 0
def digest(fasta_records, enzyme):
    """
    Divide a genome into restriction fragments.
    Parameters
    ----------
    fasta_records : OrderedDict
        Dictionary of chromosome names to sequence records.
    enzyme: str
        Name of restriction enzyme.
    Returns
    -------
    Dataframe with columns: 'chrom', 'start', 'end'.
    """
    import Bio.Restriction as biorst
    import Bio.Seq as bioseq
    # http://biopython.org/DIST/docs/cookbook/Restriction.html#mozTocId447698
    chroms = fasta_records.keys()
    try:
        cut_finder = getattr(biorst, enzyme).search
    except AttributeError:
        raise ValueError('Unknown enzyme name: {}'.format(enzyme))

    def _each(chrom):
        seq = bioseq.Seq(str(fasta_records[chrom]))
        cuts = np.r_[0, np.array(cut_finder(seq)) + 1, len(seq)].astype(int)
        n_frags = len(cuts) - 1

        frags = pd.DataFrame({
            'chrom': [chrom] * n_frags,
            'start': cuts[:-1],
            'end': cuts[1:]},
            columns=['chrom', 'start', 'end'])
        return frags

    return pd.concat(map(_each, chroms), axis=0, ignore_index=True)
feature.py 文件源码 项目:EMBLmyGFF3 作者: NBISweden 项目源码 文件源码 阅读 16 收藏 0 点赞 0 评论 0
def sequence(self):
        """
        Returns the nucleotide sequence of self
        """
        if self.location.strand > 0:
            return SeqFeature(location = self.location).extract(self.seq)

        seq = Seq("")
        for part in reversed(self.location.parts):
            seq += SeqFeature(location = part).extract(self.seq)

        return seq
__init__.py 文件源码 项目:seqenv 作者: xapple 项目源码 文件源码 阅读 15 收藏 0 点赞 0 评论 0
def add_str(self, seq, name=None, description=""):
        self.add_seq(SeqRecord(Seq(seq), id=name, description=description))
reduce_exons.py 文件源码 项目:exfi 作者: jlanga 项目源码 文件源码 阅读 17 收藏 0 点赞 0 评论 0
def reduce_exons(iterable_of_seqrecords):
    """Reduce the exons by sequence identity

    Build a dict whose keys are the sequence in str format and the values are
    lists in which the first element is the exon_id and the remaining are
    the coordinates in loci:start-end format
    """
    # Initialize
    seq_to_data = dict()

    # Go over each record
    for record in iterable_of_seqrecords:
        seq = str(record.seq)
        if seq in seq_to_data:  # Just append
            seq_to_data[seq].append(record.description)
        else:  # Enter values in both dicts
            seq_to_data[seq] = [
                'EXON{:011d}'.format(len(seq_to_data) + 1),
                record.description
            ]

    # Collect data from both dicts into a SeqRecord and return
    for seq in seq_to_data.keys():
        identifier, *description = seq_to_data[seq]
        record = SeqRecord(
            id=identifier,
            description=" ".join(description),
            seq=Seq(seq)
        )
        yield record
io.py 文件源码 项目:exfi 作者: jlanga 项目源码 文件源码 阅读 21 收藏 0 点赞 0 评论 0
def _segments_to_exon_dict(segments):
    """(list of lists) -> dict

    Conver a list of ["S", "ident", "seq", *whatever] to a dict {ident: SeqRecord}
    """
    exon_dict = {}
    for segment in segments:
        exon_id, sequence = segment[1:3]
        exon_dict[exon_id] = SeqRecord(
            id=exon_id,
            description="",
            seq=Seq(sequence)
        )
    return exon_dict
io.py 文件源码 项目:exfi 作者: jlanga 项目源码 文件源码 阅读 45 收藏 0 点赞 0 评论 0
def _compose_paths(exon_dict, path_dict, number_of_ns):
    ns = "N" * number_of_ns
    for transcript_id, exon_list in sorted(path_dict.items()):
        exon_seqs = [str(exon_dict[exon_id].seq) for exon_id in exon_list]
        yield SeqRecord(
            id=transcript_id,
            description=",".join(exon_list),
            seq=Seq(ns.join(exon_seqs))
        )
general.py 文件源码 项目:NucDiff 作者: uio-cels 项目源码 文件源码 阅读 23 收藏 0 点赞 0 评论 0
def COMPL_STRING(line):

    my_seq = Seq(line)
    compl_line=str(my_seq.reverse_complement())

    return compl_line
randomSeq-solution.py 文件源码 项目:FMAB 作者: BrendelGroup 项目源码 文件源码 阅读 16 收藏 0 点赞 0 评论 0
def gen_sequence(comp, length) :
    seq_string = ''

    ##### Part 3 : Your random sequence generating code goes here

    # Assertion is advised
    assert abs( sum(comp.values())-1 ) < 0.01 , 'Probabilities do not add up to 1.'

    # We generate a sequence of given length
    for i in range(length) :
        # Get a random number in [0,1)
        dice = random()
        limit=0
        # We divide [0,1) interval according to probabilities of each nucleotide
        for nuc in comp :
            limit += comp[nuc]
            # We add the letter that dice hits
            if dice<limit :
                seq_string += nuc
                limit = 0
                # Roll another dice for the next nucleotide
                break

    #####

    sequence = Seq(seq_string)
    return SeqRecord(sequence, id='Random Sequence', description=comp.__repr__())

##### ALL MODIFICATIONS GO ABOVE THIS LINE #####

### Part 0 : Argument Parsing
### We want out program to have easy-to-use parameters
### We are using the argparse library for this
randomSeq.py 文件源码 项目:FMAB 作者: BrendelGroup 项目源码 文件源码 阅读 18 收藏 0 点赞 0 评论 0
def gen_sequence(comp, length) :
    seq_string = ''

##### Part 3 : Your random sequence generating code goes here
##### Goal   : Fill in seq_string with a random sequence of given composition

    sequence = Seq(seq_string)
    return SeqRecord(sequence, id='Random Sequence', description=comp.__repr__())

##### ALL MODIFICATIONS GO ABOVE THIS LINE #####

### Part 0 : Argument Parsing
### We want out program to have easy-to-use parameters
### We are using the argparse library for this
gene_info.py 文件源码 项目:gene_info 作者: sggaffney 项目源码 文件源码 阅读 19 收藏 0 点赞 0 评论 0
def cds_seq(self):
        """Lookup transcript, exon and UTR info from Ensembl Rest API."""
        if not self._cds_seq:
            seq_info = client.perform_rest_action(
                '/sequence/id/{0}'.format(self.transcript_id),
                params={'type': 'cds'})
            seq_str = seq_info['seq']
            if seq_str.startswith('N') or seq_str.endswith('N'):
                seq_str = seq_str.strip('N')
            seq = Seq(seq_str, IUPAC.unambiguous_dna)
            self._cds_seq = seq
            self._aa_seq = seq.translate()
        else:
            seq = self._cds_seq
        return seq


问题


面经


文章

微信
公众号

扫码关注公众号