python类FastaFile()的实例源码

load.py 文件源码 项目:OrbWeaver 作者: rajanil 项目源码 文件源码 阅读 41 收藏 0 点赞 0 评论 0
def __init__(self, fasta_filename, prediction_type, cellgroup_mapping=None):

        self._genome_handle = pysam.FastaFile(fasta_filename)
        self.prediction_type = prediction_type

        if cellgroup_mapping is not None:
            self.cellgroup_mapping = cellgroup_mapping
            self.C = len(self.cellgroup_mapping)
file_parse.py 文件源码 项目:CSI 作者: YangLab 项目源码 文件源码 阅读 19 收藏 0 点赞 0 评论 0
def check_fasta(fa_f, pysam_flag=True):
    if not os.path.isfile(fa_f + '.fai'):
        pysam.faidx(fa_f)
    if pysam_flag:  # return pysam FastaFile object
        fa = pysam.FastaFile(fa_f)
        return fa
    else:  # return fasta file path
        return fa_f
baseline.py 文件源码 项目:DREAM_invivo_tf_binding_prediction_challenge_baseline 作者: nboley 项目源码 文件源码 阅读 29 收藏 0 点赞 0 评论 0
def iter_seqs(self, fasta_fname):
        genome = pysam.FastaFile(fasta_fname)
        return (
            genome.fetch(contig, start, stop+1).upper()
            for contig, start, stop
            in self.iter_regions(flank_size=400)
        )
tview.py 文件源码 项目:medaka 作者: nanoporetech 项目源码 文件源码 阅读 19 收藏 0 点赞 0 评论 0
def __init__(self, bam:str, reference:str, columns:int=20000, cache:int=None, minor_gaps:bool=False):
        """Interface to samtools tview functionality.

        :param bam: path to bam file of reads aligned to a common
            reference.
        :param reference: path to fasta file of reference to which reads
            are aligned.
        :param columns: default number of tview columns to read in one go.
        :param cache: Currently not implemented.
        :param minor_gaps: encoded gaps in non-reference positions distinctly.
        """
        self.logger = logging.getLogger('TView')
        self._bam = bam
        self._reference = reference
        self._columns = columns
        self._cache = cache
        self._minor_gaps = minor_gaps
        if self._minor_gaps:
            raise NotImplementedError
        self._columns_per_base = 3 # reasonable estimate
        self._column_pad = 1.1 # multiplier when _columns_per_base is updated

        # crosscheck the inputs
        with pysam.AlignmentFile(bam, 'rb') as b:
            brefs = b.references
            blens = b.lengths
        rrefs = set(pysam.FastaFile(reference).references)
        sbrefs = set(brefs)
        if not sbrefs.issuperset(rrefs):
            raise ValueError("input bam and reference do not appear consistent: {} vs {}.".format(sbrefs, rrefs))
        self._refs = set(brefs)
        self._reflens = dict(zip(brefs, blens))

        self._pileup = None # buffered `Pileup` object
util.py 文件源码 项目:gretel 作者: SamStudio8 项目源码 文件源码 阅读 22 收藏 0 点赞 0 评论 0
def load_fasta(fa_path):
    """
    A convenient wrapper function for constructing a :py:class:`pysam.FastaFile`

    Parameters
    ----------
    fa_path : str
        Path to FASTA

    Returns
    -------

    FASTA File Interface : :py:class:`pysam.FastaFile`
    """
    return pysam.FastaFile(fa_path)
reftools.py 文件源码 项目:XYalign 作者: WilsonSayresLab 项目源码 文件源码 阅读 22 收藏 0 点赞 0 评论 0
def get_chrom_length(self, chrom):
        """
        Extract chromosome length from fasta.

        Parameters
        ----------

        chrom : str
            The name of the chromosome or scaffold.

        Returns
        -------

        length : int
            The length (integer) of the chromsome/scaffold

        Raises
        ------

        RuntimeError
            If chromosome name not present in bam header

        """
        fastafile = pysam.FastaFile(self.filepath)
        lengths = dict(zip(fastafile.references, fastafile.lengths))
        try:
            lens = lengths[chrom]
            fastafile.close()
            return lens
        except:
            self.logger.error(
                "{} not present in {}. Exiting.".format(
                    chrom, self.filepath))
            logging.shutdown()
            raise RuntimeError(
                "Chromosome name not present in fasta. Exiting")
reftools.py 文件源码 项目:XYalign 作者: WilsonSayresLab 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def chromosome_lengths(self):
        """
        Returns
        -------

        tuple
            Chromosome lengths ordered by sequence order in fasta

        """
        fastafile = pysam.FastaFile(self.filepath)
        lengths = fastafile.lengths
        fastafile.close()
        # pysam's .lengths does not return a tuple (despite what is in the docs),
        # so, convert to tuple before returning.
        return tuple(lengths)
reftools.py 文件源码 项目:XYalign 作者: WilsonSayresLab 项目源码 文件源码 阅读 23 收藏 0 点赞 0 评论 0
def chromosome_names(self):
        """
        Returns
        -------

        tuple
            Chromosome names ordered by sequence order in fasta

        """
        fastafile = pysam.FastaFile(self.filepath)
        names = fastafile.references
        fastafile.close()
        # pysam's .lengths does not return a tuple (despite what is in the docs),
        # so, convert to tuple before returning.
        return tuple(names)
fabgz.py 文件源码 项目:biocommons.seqrepo 作者: biocommons 项目源码 文件源码 阅读 22 收藏 0 点赞 0 评论 0
def __init__(self, filename):
        self._fh = FastaFile(filename)
fabgz.py 文件源码 项目:biocommons.seqrepo 作者: biocommons 项目源码 文件源码 阅读 23 收藏 0 点赞 0 评论 0
def close(self):
        if self._fh:
            self._fh.close()
            self._fh = None
            subprocess.check_call([bgzip_exe, "--force", self._basepath])
            os.rename(self._basepath + ".gz", self.filename)

            # open file with FastaFile to create indexes, then make all read-only
            _fh = FastaFile(self.filename)
            _fh.close()
            os.chmod(self.filename, stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
            os.chmod(self.filename + ".fai", stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
            os.chmod(self.filename + ".gzi", stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)

            logger.info("{} written; added {} sequences".format(self.filename, len(self._added)))
plot.py 文件源码 项目:shabam 作者: dlrice 项目源码 文件源码 阅读 23 收藏 0 点赞 0 评论 0
def covplot(seqfiles, chrom, start, end, fastafile, out=None):
    '''
    '''
    if type(seqfiles) is not list:
        seqfiles = [seqfiles]

    chrom = str(chrom)
    with pysam.FastaFile(fastafile) as handle:
        reference = handle.fetch(start=start, end=end, region=chrom)
        ref = reference

    axis_offset = 75
    cov_height = 200
    height = len(seqfiles) * cov_height + (len(seqfiles) - 1 ) * 50 + axis_offset

    out_type, surface = fileformat(out, width=(end - start) * 10, height=height)
    context = cairo.Context(surface)

    plot_axis(context, start, end, axis_offset - 10)
    plot_grid(context, start, end, axis_offset, height)

    for seqfile in seqfiles:
        seq = pysam.AlignmentFile(seqfile, 'rb')
        coords = OrderedDict({0: -1e9})
        reps = ( parse_read(x, coords, ref, start) for x in seq.fetch(chrom, start, end) )

        plot_coverage(context, get_coverage(reps), axis_offset, start, (end - start) * 10, cov_height)
        axis_offset += cov_height

        if seqfiles.index(seqfile) < len(seqfiles) - 1:
            insert_spacer(context, coords, start, end)
            axis_offset += 50

    context.save()

    if out_type == 'png':
        surface.write_to_png(out)
    elif out_type is None:
        return surface.write_to_png()
bias_utils.py 文件源码 项目:brie 作者: huangyh09 项目源码 文件源码 阅读 30 收藏 0 点赞 0 评论 0
def __init__(self, fasta_file):
        self.f = pysam.FastaFile(fasta_file)
fasta_utils.py 文件源码 项目:brie 作者: huangyh09 项目源码 文件源码 阅读 21 收藏 0 点赞 0 评论 0
def __init__(self, fasta_file):
        self.f = pysam.FastaFile(fasta_file)
tview.py 文件源码 项目:medaka 作者: nanoporetech 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def bam_pileup_to_hdf(hdf, bams, ref_fasta, ref_name:str=None, start:int=0, end:int=None, truth_bam=None,
                      chunk_len=50000, overlap=1000):
    """Create an .hdf file containing chunks of a pileup.

    :param hdf: output .hdf file (will be overwritten).
    :param bams: iterable of input .bam files.
    :param ref_fasta: input .fasta file.
    :param ref_name: name of reference within .bam to parse.
    :param start: start reference coordinate.
    :param end: end reference coordinate. If `None` the full extent of
        the reference will be parsed.
    :param truth_bam: .bam file of truth aligned to ref to generate labels.
    :param chunk_len: int, chunk length in reference coordinates.
    :param overlap: int, overlap of adjacent chunks in reference coordinates.


    ..note:: Datasets will be name as `{reference}_{start}_{end}`, indicating
        the reference coordinates spanned.

    """
    if ref_name is None:
        # use all references
        refs = pysam.FastaFile(ref_fasta).references
        assert end == None
        assert start == 0
    else:
        refs = [ref_name,]

    with h5py.File(hdf, 'w') as hdf:
        for ref in refs:
            if truth_bam is not None:
                gen = generate_training_chunks(truth_bam, bams, ref_fasta, ref_name=ref,
                                               start=start, end=end, chunk_len=chunk_len, overlap=overlap)
            else:
                gen = generate_pileup_chunks(bams, ref_fasta, ref_name=ref, start=start, end=end,
                                             chunk_len=chunk_len, overlap=overlap)
            logger.info("Processing reference {}.".format(ref))
            for c in gen:
                pos = c.pileups[0].positions
                chunk_str = '{}_{}_{}'.format(ref, pos['major'][0], pos['major'][-1] + 1)
                if c.labels is not None:
                    # save labels
                    hdf['{}/labels'.format(chunk_str)] = np.char.encode(c.labels)
                if c.ref_seq is not None:
                    # save ref
                    hdf['{}/ref_seq'.format(chunk_str)] = np.char.encode(c.ref_seq)

                for bam_count, p in enumerate(c.pileups):
                    assert p.ref_name == ref
                    grp = '{}/datatype{}'.format(chunk_str, bam_count)
                    hdf['{}/positions'.format(grp)] = p.positions
                    hdf[grp].attrs['rname'] = p.ref_name
                    hdf[grp].attrs['start'] = p.positions['major'][0]
                    hdf[grp].attrs['end'] = p.positions['major'][-1]
                    hdf[grp].attrs['bam'] = p.bam
                    hdf['{}/data'.format(grp)] = p.reads
plot.py 文件源码 项目:shabam 作者: dlrice 项目源码 文件源码 阅读 21 收藏 0 点赞 0 评论 0
def seqplot(seqfiles, chrom, start, end, fastafile, out=None, by_strand=False):
    ''' the plotting function

    Args:
        seqfiles: list of paths to sequence files
        chrom: chromosome to fetch reads from
        start: start nucleotide of plotting window.
        end: end nucleotide of plotting window.
        fastafile: path to reference FASTA file.
        out: path to write image file to, or None to return bytes-encoded png
        by_strand: whether to shade reads by strand

    Returns:
        None, or if out is None, returns image plot as bytes-encoded png
    '''

    if type(seqfiles) is not list:
        seqfiles = [seqfiles]

    chrom = str(chrom)
    with pysam.FastaFile(fastafile) as handle:
        reference = handle.fetch(start=start, end=end, region=chrom)
        ref = reference

    axis_offset = 75
    height = get_height(seqfiles, chrom, start, end, axis_offset)

    out_type, surface = fileformat(out, width=(end - start) * 10, height=height)
    context = cairo.Context(surface)

    depths = [axis_offset]
    for seqfile in seqfiles:
        seq = pysam.AlignmentFile(seqfile, 'rb')
        coords = OrderedDict({max(depths): -1e9})
        reps = ( parse_read(x, coords, ref, start) for x in seq.fetch(chrom, start, end) )

        _plot(context, reps, start, end, axis_offset, height, reference, by_strand)
        reference = None # don't plot the reference in subsequent BAMs

        if seqfiles.index(seqfile) < len(seqfiles) - 1:
            insert_spacer(context, coords, start, end)

        depths.append(max(coords))

    context.save()

    if out_type == 'png':
        surface.write_to_png(out)
    elif out_type is None:
        return surface.write_to_png()
test_position_permutation_test.py 文件源码 项目:probabilistic2020 作者: KarchinLab 项目源码 文件源码 阅读 18 收藏 0 点赞 0 评论 0
def test_ctnnb1_get_aa_mut_info():
    import pysam
    from prob2020.python.gene_sequence import GeneSequence

    # read fasta
    ctnnb1_fasta = os.path.join(file_dir, 'data/CTNNB1.fa')
    gene_fa = pysam.Fastafile(ctnnb1_fasta)
    gs = GeneSequence(gene_fa, nuc_context=1)

    # read CTNNB1 bed file
    ctnnb1_bed = os.path.join(file_dir, 'data/CTNNB1.bed')
    bed_list = [b for b in utils.bed_generator(ctnnb1_bed)]
    gs.set_gene(bed_list[0])

    # specify mutation
    coding_pos = [0]
    somatic_base = ['C']

    # check mutation info
    aa_info = mc.get_aa_mut_info(coding_pos, somatic_base, gs)
    ref_codon_msg =  'First codon should be start codon ({0})'.format(aa_info['Reference Codon'][0])
    assert aa_info['Reference Codon'][0] == 'ATG', ref_codon_msg
    assert aa_info['Somatic Codon'][0] == 'CTG', 'First "A" should be replaced with a "C"'
    assert aa_info['Codon Pos'][0] == 0, 'Start codon should be position 0'
mutation_context.py 文件源码 项目:probabilistic2020 作者: KarchinLab 项目源码 文件源码 阅读 18 收藏 0 点赞 0 评论 0
def recover_unmapped_mut_info(mut_info, bed, sc, opts):
    # retreive info based on annotated protein effects and genomic coordinates
    has_unmapped_opts = ('use_unmapped' in opts) and ('genome' in opts)
    use_unmapped = opts['use_unmapped'] and opts['genome']
    if has_unmapped_opts and use_unmapped:
        genome_fa = pysam.Fastafile(opts['genome'])
        # try to still use mutations that are not on the reference transcript
        tmp_mut_info = mut_info[mut_info['Coding Position'].isnull()]
        unmapped_mut_info = get_unmapped_aa_mut_info(tmp_mut_info,
                                                     genome_fa,
                                                     bed.strand,
                                                     bed.chrom,
                                                     opts['context'])
        genome_fa.close()
        # fill in tumor sample/tumor type info
        unmapped_mut_info['Tumor_Sample'] = tmp_mut_info['Tumor_Sample'].tolist()
        unmapped_mut_info['Tumor_Type'] = tmp_mut_info['Tumor_Type'].tolist()

        # filter out cases where the nucleotide context does not exist
        # on the reference transcript
        bad_contexts = [i for i in range(len(unmapped_mut_info['Context']))
                        if not sc.is_valid_context(unmapped_mut_info['Context'][i])]
        for key in unmapped_mut_info:
            unmapped_mut_info[key] = utils.filter_list(unmapped_mut_info[key],
                                                       bad_contexts)
    else:
        unmapped_mut_info = {'Context': [], 'Reference AA': [], 'Codon Pos': [],
                             'Somatic AA': [], 'Tumor_Allele': [],
                             'Tumor_Sample': [], 'Tumor_Type':[]}
    return unmapped_mut_info
extract_gene_seq.py 文件源码 项目:probabilistic2020 作者: KarchinLab 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def main(opts):
    # read bed file, extract gene sequence from genome, write to fasta
    genome_fa = pysam.Fastafile(opts['input'])
    with open(opts['output'], 'w') as handle:
        for bed_row in utils.bed_generator(opts['bed']):
            fasta_seq = gs.fetch_gene_fasta(bed_row, genome_fa)
            handle.write(fasta_seq)
    genome_fa.close()
data.py 文件源码 项目:CAVA 作者: RahmanTeam 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def __init__(self, options):
        # Openning tabix file representing the reference genome
        self.fastafile = pysam.Fastafile(options.args['reference'])

    # Retrieving the sequence of a genomic region
bakH.py 文件源码 项目:PBSuite 作者: dbrowneup 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def __init__(self, task_queue, result_queue, bamName, referenceName, honH5): #*args, **kwargs):
        multiprocessing.Process.__init__(self)
        self.task_queue = task_queue
        self.result_queue = result_queue
        self.bam = pysam.Samfile(bamName)
        if referenceName is not None:
            self.reference = pysam.Fastafile(referenceName)
        else:
            self.reference = None
        self.honH5 = honH5
        #self.args = args
        #self.kwargs = kwargs
HSpots.py 文件源码 项目:PBSuite 作者: dbrowneup 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def __init__(self, task_queue, result_queue, bamName, referenceName, honH5): #*args, **kwargs):
        multiprocessing.Process.__init__(self)
        self.task_queue = task_queue
        self.result_queue = result_queue
        self.bam = pysam.Samfile(bamName)
        if referenceName is not None:
            self.reference = pysam.Fastafile(referenceName)
        else:
            self.reference = None
        self.honH5 = honH5
        #self.args = args
        #self.kwargs = kwargs
HSpots.py 文件源码 项目:PBSuite 作者: dbrowneup 项目源码 文件源码 阅读 34 收藏 0 点赞 0 评论 0
def __init__(self, task_queue, result_queue, bamName, referenceName, honH5): #*args, **kwargs):
        multiprocessing.Process.__init__(self)
        self.task_queue = task_queue
        self.result_queue = result_queue
        self.bam = pysam.Samfile(bamName)
        if referenceName is not None:
            self.reference = pysam.Fastafile(referenceName)
        else:
            self.reference = None
        self.honH5 = honH5
        #self.args = args
        #self.kwargs = kwargs
data.py 文件源码 项目:OpEx 作者: RahmanTeam 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def __init__(self,options):
        # Openning tabix file representing the reference genome
        self.fastafile=pysam.Fastafile(options.args['reference'])

    # Retrieving the sequence of a genomic region
mutation_context.py 文件源码 项目:probabilistic2020 作者: KarchinLab 项目源码 文件源码 阅读 18 收藏 0 点赞 0 评论 0
def is_nonsilent(mut_df, bed_dict, opts):
    # convert dictionary to list for bed objects
    gene_beds = [b
                 for chrom in bed_dict
                 for b in bed_dict[chrom]]

    # initiate gene sequences
    gene_fa = pysam.Fastafile(opts['input'])
    gs = GeneSequence(gene_fa, nuc_context=opts['context'])

    # non-silent SNV classes
    non_silent_snv = ['Nonsense_Mutation', 'Nonstop_Mutation', 'Splice_Site',
                      'Translation_Start_Site', 'Missense_Mutation']

    # record indels and get only snvs
    mut_df['is_nonsilent'] = 0
    indel_flag = indel.is_indel_annotation(mut_df)
    mut_df.loc[indel_flag, 'is_nonsilent'] = 1
    snv_df = mut_df[~indel_flag]

    # iterate over each gene
    for bed in gene_beds:
        # initiate for this gene
        tmp_df = snv_df[snv_df['Gene']==bed.gene_name]
        gs.set_gene(bed)

        # compute context counts and somatic bases for each context
        gene_tuple = compute_mutation_context(bed, gs, tmp_df, opts)
        context_cts, context_to_mutations, mutations_df, gs, sc = gene_tuple

        if len(mutations_df):
            # get snv information
            tmp_mut_info = get_aa_mut_info(mutations_df['Coding Position'],
                                           mutations_df['Tumor_Allele'].tolist(),
                                           gs)

            # get string describing variant
            var_class = cutils.get_variant_classification(tmp_mut_info['Reference AA'],
                                                          tmp_mut_info['Somatic AA'],
                                                          tmp_mut_info['Codon Pos'])

            # detect if non-silent snv
            is_nonsilent_snv = [1 if (x in non_silent_snv) else 0
                                for x in var_class]
            mut_df.loc[tmp_df.index, 'is_nonsilent'] = is_nonsilent_snv

    # return a pandas series indicating nonsilent status
    is_nonsilent_series = mut_df['is_nonsilent'].copy()
    del mut_df['is_nonsilent']
    return is_nonsilent_series
annotate.py 文件源码 项目:probabilistic2020 作者: KarchinLab 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def main(opts):
    # hack to index the FASTA file
    gene_fa = pysam.Fastafile(opts['input'])
    gene_fa.close()

    # Get Mutations
    mut_df = pd.read_csv(opts['mutations'], sep='\t')
    orig_num_mut = len(mut_df)

    # rename columns to fit my internal column names
    rename_dict = {
        'Hugo_Symbol': 'Gene',
        'Tumor_Sample_Barcode': 'Tumor_Sample',
        'Tumor_Seq_Allele2' : 'Tumor_Allele'
    }
    mut_df.rename(columns=rename_dict, inplace=True)

    # restrict to only observed genes if flag present
    restricted_genes = None
    if opts['restrict_genes']:
        restricted_genes = set(mut_df['Gene'].unique())

    # process indels
    indel_df = indel.keep_indels(mut_df)  # return indels only
    indel_df.loc[:, 'Start_Position'] = indel_df['Start_Position'] - 1  # convert to 0-based
    indel_df.loc[:, 'indel len'] = indel_df['indel len'] + 1
    logger.info('There were {0} indels identified.'.format(len(indel_df)))
    mut_df = mut_df.dropna(subset=['Tumor_Allele', 'Start_Position', 'Chromosome'])
    logger.info('Kept {0} mutations after droping mutations with missing '
                'information (Droped: {1})'.format(len(mut_df), orig_num_mut - len(mut_df)))

    # select valid single nucleotide variants only
    mut_df = utils._fix_mutation_df(mut_df, opts['unique'])

    # read in bed info
    bed_dict = utils.read_bed(opts['bed'], restricted_genes)

    # perform permutation
    opts['handle'] = open(opts['output'], 'w')
    multiprocess_permutation(bed_dict, mut_df, opts, indel_df)

    # save indels
    if opts['maf']:
        #with open(opts['output'], 'a') as handle:
        mywriter = csv.writer(opts['handle'], delimiter='\t', lineterminator='\n')
        for maf_lines in indel.simulate_indel_maf(indel_df, bed_dict,
                                                  opts['num_iterations'],
                                                  opts['seed']):
            mywriter.writerows(maf_lines)
    opts['handle'].close()
check_mutations.py 文件源码 项目:probabilistic2020 作者: KarchinLab 项目源码 文件源码 阅读 20 收藏 0 点赞 0 评论 0
def main(opts):
    # read in mutations
    mut_df = pd.read_csv(opts['mutations'], sep='\t')
    orig_num_mut = len(mut_df)

    # correct chromosome names
    mut_df['Chromosome'] = correct_chrom_names(mut_df['Chromosome'])

    # fix additional issues
    mut_df = mut_df.dropna(subset=['Tumor_Allele', 'Start_Position', 'Chromosome'])
    logger.info('Kept {0} mutations after droping mutations with missing '
                'information (Droped: {1})'.format(len(mut_df), orig_num_mut - len(mut_df)))
    mut_df = utils._fix_mutation_df(mut_df)

    # read genome fasta file
    genome_fa = pysam.Fastafile(opts['fasta'])

    # read BED file for transcripts
    bed_dict = utils.read_bed(opts['bed'], [])
    gene2bed = {item.gene_name: item
                for bed_list in bed_dict.values()
                for item in bed_list}

    # group mutations by gene
    mut_grpby = mut_df.groupby('Gene')
    unmapped_mut_list = []
    for i, mut_info in mut_grpby:
        gene_name = mut_info['Gene'].iloc[0]

        # try to find tx annotation for gene
        bed = None
        try:
            bed = gene2bed[gene_name]
        except KeyError:
            pass

        if bed:
            # get coding positions, mutations unmapped to the reference tx will have
            # NA for a coding position
            for ix, row in mut_info.iterrows():
                coding_pos = bed.query_position(bed.strand,
                                                row['Chromosome'],
                                                row['Start_Position'])
                if not coding_pos:
                    unmapped_mut_list.append(row.tolist())
        else:
            #unmapped_mut_df = pd.concat([unmapped_mut_df, mut_info])
            unmapped_mut_list += mut_info.values.tolist()

    # save the unmapped mutations to a file
    unmapped_mut_df = pd.DataFrame(unmapped_mut_list, columns=mut_df.columns)
    logger.info('{0} mutations were unmappable to a '
                'reference transcript'.format(len(unmapped_mut_df)))
    unmapped_mut_df.to_csv(opts['unmapped'], sep='\t', index=False)

    coord_base, coord_strand = detect_coordinates(mut_df, genome_fa)
    logger.info('RESULT: {0}-based coordinates, positions reported on {1} strand'.format(coord_base, coord_strand))

    genome_fa.close()
bakH.py 文件源码 项目:PBSuite 作者: dbrowneup 项目源码 文件源码 阅读 27 收藏 0 点赞 0 评论 0
def test(argv):
    numpy.seterr(all="ignore")
    args = parseArgs(argv)
    setupLogging(True)#keep debug on.. you're testing!
    logging.critical(("Running HSpots.py directly implements testing mode. "
                      "If you're trying to run the full, actual program, use "
                      "Honey.py spots"))

    bam = pysam.Samfile(args.bam)
    reference = pysam.Fastafile(args.reference)
    try:
        if bam.header["HD"]["SO"] != "coordinate":
            logging.warning("BAM is not sorted by coordinates! Performance may be slower")
    except KeyError:
        logging.warning("Assuming BAM is sorted by coordinate. Be sure this is correct")
    logging.info("Running in test mode")

    #do what you will.. from here
    #spot = SpotResult(chrom='11', start=2215290, end=2215798, svtype="DEL", size=208)
    #spot = SpotResult(chrom='22', start=45964261, end=45965596, svtype="DEL", size=-1)
    # This is what I need to start with
    #spot = SpotResult(chrom="22", start=45963975, end=45964532, svtype="DEL", size=57)
    fh = open("honeymissing.bed")
    for line in fh.readlines():
        data = line.strip().split('\t')
        spot = SpotResult(chrom=data[0], start=int(data[1]), end = int(data[2]), \
                          size=int(data[3].split('=')[-1]), svtype="DEL")

        j = SpotCaller('group', spot.chrom, spot.start, spot.end, args)
        if j.supportingReadsFilter(spot, bam, args):
            consen = ConsensusCaller(spot, args)
            consen(bam, reference, 'none')
            for i in consen.newSpots:
                i.tags["seqmade"] = True
                print i
            if len(consen.newSpots) == 0:
                spot.tags["noseq"] = True
                print str(spot)
        else:
            spot.tags["filtfail"] = True
            print str(spot)

    #done with test code
    logging.info("Finished testing")
HSpots.py 文件源码 项目:PBSuite 作者: dbrowneup 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def test(argv):
    numpy.seterr(all="ignore")
    args = parseArgs(argv)
    setupLogging(True)#keep debug on.. you're testing!
    logging.critical(("Running HSpots.py directly implements testing mode. "
                      "If you're trying to run the full, actual program, use "
                      "Honey.py spots"))

    bam = pysam.Samfile(args.bam)
    reference = pysam.Fastafile(args.reference)
    try:
        if bam.header["HD"]["SO"] != "coordinate":
            logging.warning("BAM is not sorted by coordinates! Performance may be slower")
    except KeyError:
        logging.warning("Assuming BAM is sorted by coordinate. Be sure this is correct")
    logging.info("Running in test mode")

    #do what you will.. from here
    # This is what I need to start with
    #spot = SpotResult(chrom="7", start=138402727, end=138402830, svtype="INS", size=113)
    chrom="3"      
    start,end = (195498264, 195498609)
    start -=200
    end +=200
    spot = SpotResult(chrom=chrom, start=start, end=end, svtype="DEL", size=100)

    #fh = open("possible.bed")
    #for line in fh.readlines():
        #data = line.strip().split('\t')
        #spot = SpotResult(chrom=data[0], start=int(data[8]), end = int(data[9]), \
                          #size=int(data[5]), svtype=data[4])

    j = SpotCaller('group', spot.chrom, spot.start, spot.end, args)
    if j.supportingReadsFilter(spot, bam, args):
        consen = ConsensusCaller(spot, args)
        consen(bam, reference, 'none')
        for i in consen.newSpots:
            i.tags["seqmade"] = True
            print i
        if len(consen.newSpots) == 0:
            spot.tags["noseq"] = True
            print str(spot)
    else:
        spot.tags["filtfail"] = True
        print str(spot)
    #done with test code
    logging.info("Finished testing")
__main__.py 文件源码 项目:CNValloc 作者: m1m0r1 项目源码 文件源码 阅读 22 收藏 0 点赞 0 评论 0
def bam2hist(args):
    samfile = pysam.Samfile(args.bam)

    regions = samfile
    (rname, start, end) = parse_region(args.region)

    refseq = None
    if args.genome:
        fasta = pysam.Fastafile(args.genome)
        refseq = fasta.fetch(reference=rname, start=start, end=end)

    if args.with_header:
        print ('rname', 'pos', 'offset',
               'ref',
               'nread', 'major_base', 'major_count', 'minor_count',
               'major_freq', 'minor_freq',
               'N', 'A', 'T', 'C' ,'G', 'D', 'ndel', 'nins',
              sep='\t')

    if args.use_multimaps:
        skip_flag = BAM_FLAGS.unmapped
    else:
        skip_flag = SKIP_FLAG

    for p in samfile.pileup(reference=rname, start=start, end=end, mask=skip_flag):
        if (p.pos < start) or (end is not None and end <= p.pos):  # skip outside of region
            continue

        h = ReadHist()
        for read in p.pileups:
            if read.query_position is None:
                continue
            info = ReadInfo(read, skip_flag=skip_flag)
            if info.skip_flag:
                continue
            h.add_read(info)
        major_base = h.major_base()
        major_count = getattr(h, major_base)
        minor_count = h.total_count() - major_count
        major_freq = h.major_freq()
        if major_freq > args.max_major_freq:
            continue
        print (rname, p.pos + 1, p.pos - start,
               '.' if refseq is None else refseq[p.pos - start],
               p.n, major_base, major_count, minor_count,
               '{0:.3f}'.format(major_freq), '{0:.3f}'.format(1 - major_freq),
               h.N, h.A, h.T, h.C, h.G, h.D, h.ndel, h.nins,
               sep='\t')


问题


面经


文章

微信
公众号

扫码关注公众号