def __init__(self, fasta_filename, prediction_type, cellgroup_mapping=None):
    self._genome_handle = pysam.FastaFile(fasta_filename)
    self.prediction_type = prediction_type
    if cellgroup_mapping is not None:
        self.cellgroup_mapping = cellgroup_mapping
        self.C = len(self.cellgroup_mapping)
Example source code for Python's FastaFile() class
def check_fasta(fa_f, pysam_flag=True):
    if not os.path.isfile(fa_f + '.fai'):
        pysam.faidx(fa_f)
    if pysam_flag:  # return pysam FastaFile object
        fa = pysam.FastaFile(fa_f)
        return fa
    else:  # return fasta file path
        return fa_f
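A minimal usage sketch (the FASTA path is illustrative):

fa = check_fasta('genome.fa')                      # indexes on first use, returns pysam.FastaFile
seq = fa.fetch('chr1', 0, 100)                     # 0-based, half-open coordinates
fa.close()
path = check_fasta('genome.fa', pysam_flag=False)  # returns the path unchanged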
Source: baseline.py (project: DREAM_invivo_tf_binding_prediction_challenge_baseline, author: nboley)
def iter_seqs(self, fasta_fname):
    genome = pysam.FastaFile(fasta_fname)
    return (
        genome.fetch(contig, start, stop + 1).upper()
        for contig, start, stop
        in self.iter_regions(flank_size=400)
    )
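The stop + 1 above converts an inclusive stop coordinate into the exclusive end that pysam's fetch() expects; a small sketch of the convention (file and contig names are illustrative):

import pysam

genome = pysam.FastaFile('genome.fa')
seq = genome.fetch('chr1', 0, 10)  # 0-based start, exclusive end: bases 1-10
assert len(seq) == 10
genome.close()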
def __init__(self, bam: str, reference: str, columns: int = 20000, cache: int = None, minor_gaps: bool = False):
    """Interface to samtools tview functionality.

    :param bam: path to bam file of reads aligned to a common
        reference.
    :param reference: path to fasta file of reference to which reads
        are aligned.
    :param columns: default number of tview columns to read in one go.
    :param cache: currently not implemented.
    :param minor_gaps: encode gaps in non-reference positions distinctly.
    """
    self.logger = logging.getLogger('TView')
    self._bam = bam
    self._reference = reference
    self._columns = columns
    self._cache = cache
    self._minor_gaps = minor_gaps
    if self._minor_gaps:
        raise NotImplementedError
    self._columns_per_base = 3  # reasonable estimate
    self._column_pad = 1.1  # multiplier when _columns_per_base is updated
    # cross-check the inputs
    with pysam.AlignmentFile(bam, 'rb') as b:
        brefs = b.references
        blens = b.lengths
    rrefs = set(pysam.FastaFile(reference).references)
    sbrefs = set(brefs)
    if not sbrefs.issuperset(rrefs):
        raise ValueError(
            "input bam and reference do not appear consistent: {} vs {}.".format(sbrefs, rrefs))
    self._refs = set(brefs)
    self._reflens = dict(zip(brefs, blens))
    self._pileup = None  # buffered `Pileup` object
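A hypothetical construction (file paths are illustrative; every contig in ref.fa must also appear in the BAM header, or the consistency check raises ValueError):

tv = TView('reads.bam', 'ref.fa', columns=10000)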
def load_fasta(fa_path):
    """
    A convenient wrapper function for constructing a :py:class:`pysam.FastaFile`

    Parameters
    ----------
    fa_path : str
        Path to FASTA

    Returns
    -------
    FASTA File Interface : :py:class:`pysam.FastaFile`
    """
    return pysam.FastaFile(fa_path)
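A short usage sketch (the path is illustrative):

fa = load_fasta('genome.fa')
print(fa.references[0], fa.lengths[0])  # first contig name and its length
fa.close()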
def get_chrom_length(self, chrom):
    """
    Extract chromosome length from fasta.

    Parameters
    ----------
    chrom : str
        The name of the chromosome or scaffold.

    Returns
    -------
    length : int
        The length (integer) of the chromosome/scaffold

    Raises
    ------
    RuntimeError
        If chromosome name is not present in the fasta
    """
    fastafile = pysam.FastaFile(self.filepath)
    lengths = dict(zip(fastafile.references, fastafile.lengths))
    try:
        lens = lengths[chrom]
        fastafile.close()
        return lens
    except KeyError:
        self.logger.error(
            "{} not present in {}. Exiting.".format(
                chrom, self.filepath))
        logging.shutdown()
        raise RuntimeError(
            "Chromosome name not present in fasta. Exiting")
def chromosome_lengths(self):
    """
    Returns
    -------
    tuple
        Chromosome lengths ordered by sequence order in fasta
    """
    fastafile = pysam.FastaFile(self.filepath)
    lengths = fastafile.lengths
    fastafile.close()
    # pysam's .lengths does not return a tuple (despite what the docs say),
    # so convert to tuple before returning.
    return tuple(lengths)
def chromosome_names(self):
    """
    Returns
    -------
    tuple
        Chromosome names ordered by sequence order in fasta
    """
    fastafile = pysam.FastaFile(self.filepath)
    names = fastafile.references
    fastafile.close()
    # pysam's .references does not return a tuple (despite what the docs say),
    # so convert to tuple before returning.
    return tuple(names)
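Hypothetical usage, assuming a wrapper class (here called RefFasta) that sets self.filepath and self.logger and exposes the three methods above:

ref = RefFasta('genome.fa')               # illustrative wrapper class
print(ref.get_chrom_length('chr1'))       # raises RuntimeError for unknown names
for name, length in zip(ref.chromosome_names(), ref.chromosome_lengths()):
    print(name, length)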
def __init__(self, filename):
    self._fh = FastaFile(filename)
def close(self):
    if self._fh:
        self._fh.close()
        self._fh = None
        subprocess.check_call([bgzip_exe, "--force", self._basepath])
        os.rename(self._basepath + ".gz", self.filename)
        # open file with FastaFile to create indexes, then make all read-only
        _fh = FastaFile(self.filename)
        _fh.close()
        os.chmod(self.filename, stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
        os.chmod(self.filename + ".fai", stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
        os.chmod(self.filename + ".gzi", stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
        logger.info("{} written; added {} sequences".format(self.filename, len(self._added)))
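Once close() has run, the bgzip-compressed output can be reopened read-only through the indexes created above; a minimal sketch (the filename is illustrative, and FastaFile is assumed imported from pysam as above):

fa = FastaFile('sequences.fa.gz')  # pysam reads bgzip FASTA using the .fai/.gzi indexes
print(fa.nreferences, 'sequences')
fa.close()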
def covplot(seqfiles, chrom, start, end, fastafile, out=None):
    ''' plot read coverage across a genomic region for one or more BAM files
    '''
    if type(seqfiles) is not list:
        seqfiles = [seqfiles]
    chrom = str(chrom)
    with pysam.FastaFile(fastafile) as handle:
        reference = handle.fetch(start=start, end=end, region=chrom)
    ref = reference
    axis_offset = 75
    cov_height = 200
    height = len(seqfiles) * cov_height + (len(seqfiles) - 1) * 50 + axis_offset
    out_type, surface = fileformat(out, width=(end - start) * 10, height=height)
    context = cairo.Context(surface)
    plot_axis(context, start, end, axis_offset - 10)
    plot_grid(context, start, end, axis_offset, height)
    for seqfile in seqfiles:
        seq = pysam.AlignmentFile(seqfile, 'rb')
        coords = OrderedDict({0: -1e9})
        reps = (parse_read(x, coords, ref, start) for x in seq.fetch(chrom, start, end))
        plot_coverage(context, get_coverage(reps), axis_offset, start, (end - start) * 10, cov_height)
        axis_offset += cov_height
        if seqfiles.index(seqfile) < len(seqfiles) - 1:
            insert_spacer(context, coords, start, end)
            axis_offset += 50
    context.save()
    if out_type == 'png':
        surface.write_to_png(out)
    elif out_type is None:
        return surface.write_to_png()
def __init__(self, fasta_file):
    self.f = pysam.FastaFile(fasta_file)
def bam_pileup_to_hdf(hdf, bams, ref_fasta, ref_name: str = None, start: int = 0, end: int = None,
                      truth_bam=None, chunk_len=50000, overlap=1000):
    """Create an .hdf file containing chunks of a pileup.

    :param hdf: output .hdf file (will be overwritten).
    :param bams: iterable of input .bam files.
    :param ref_fasta: input .fasta file.
    :param ref_name: name of reference within .bam to parse.
    :param start: start reference coordinate.
    :param end: end reference coordinate. If `None` the full extent of
        the reference will be parsed.
    :param truth_bam: .bam file of truth aligned to ref to generate labels.
    :param chunk_len: int, chunk length in reference coordinates.
    :param overlap: int, overlap of adjacent chunks in reference coordinates.

    .. note:: Datasets will be named `{reference}_{start}_{end}`, indicating
        the reference coordinates spanned.
    """
    if ref_name is None:
        # use all references
        refs = pysam.FastaFile(ref_fasta).references
        assert end is None
        assert start == 0
    else:
        refs = [ref_name, ]
    with h5py.File(hdf, 'w') as hdf:
        for ref in refs:
            if truth_bam is not None:
                gen = generate_training_chunks(truth_bam, bams, ref_fasta, ref_name=ref,
                                               start=start, end=end, chunk_len=chunk_len, overlap=overlap)
            else:
                gen = generate_pileup_chunks(bams, ref_fasta, ref_name=ref, start=start, end=end,
                                             chunk_len=chunk_len, overlap=overlap)
            logger.info("Processing reference {}.".format(ref))
            for c in gen:
                pos = c.pileups[0].positions
                chunk_str = '{}_{}_{}'.format(ref, pos['major'][0], pos['major'][-1] + 1)
                if c.labels is not None:
                    # save labels
                    hdf['{}/labels'.format(chunk_str)] = np.char.encode(c.labels)
                if c.ref_seq is not None:
                    # save ref
                    hdf['{}/ref_seq'.format(chunk_str)] = np.char.encode(c.ref_seq)
                for bam_count, p in enumerate(c.pileups):
                    assert p.ref_name == ref
                    grp = '{}/datatype{}'.format(chunk_str, bam_count)
                    hdf['{}/positions'.format(grp)] = p.positions
                    hdf[grp].attrs['rname'] = p.ref_name
                    hdf[grp].attrs['start'] = p.positions['major'][0]
                    hdf[grp].attrs['end'] = p.positions['major'][-1]
                    hdf[grp].attrs['bam'] = p.bam
                    hdf['{}/data'.format(grp)] = p.reads
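A sketch of reading the chunks back with h5py (the output filename is illustrative; group names follow the {reference}_{start}_{end} scheme described in the docstring):

import h5py

with h5py.File('pileup.hdf', 'r') as f:
    for chunk_str in f:
        grp = f['{}/datatype0'.format(chunk_str)]
        print(chunk_str, grp.attrs['rname'], grp.attrs['start'], grp.attrs['end'])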
def seqplot(seqfiles, chrom, start, end, fastafile, out=None, by_strand=False):
    ''' the plotting function

    Args:
        seqfiles: list of paths to sequence files
        chrom: chromosome to fetch reads from
        start: start nucleotide of plotting window.
        end: end nucleotide of plotting window.
        fastafile: path to reference FASTA file.
        out: path to write image file to, or None to return bytes-encoded png
        by_strand: whether to shade reads by strand

    Returns:
        None, or if out is None, returns image plot as bytes-encoded png
    '''
    if type(seqfiles) is not list:
        seqfiles = [seqfiles]
    chrom = str(chrom)
    with pysam.FastaFile(fastafile) as handle:
        reference = handle.fetch(start=start, end=end, region=chrom)
    ref = reference
    axis_offset = 75
    height = get_height(seqfiles, chrom, start, end, axis_offset)
    out_type, surface = fileformat(out, width=(end - start) * 10, height=height)
    context = cairo.Context(surface)
    depths = [axis_offset]
    for seqfile in seqfiles:
        seq = pysam.AlignmentFile(seqfile, 'rb')
        coords = OrderedDict({max(depths): -1e9})
        reps = (parse_read(x, coords, ref, start) for x in seq.fetch(chrom, start, end))
        _plot(context, reps, start, end, axis_offset, height, reference, by_strand)
        reference = None  # don't plot the reference in subsequent BAMs
        if seqfiles.index(seqfile) < len(seqfiles) - 1:
            insert_spacer(context, coords, start, end)
        depths.append(max(coords))
    context.save()
    if out_type == 'png':
        surface.write_to_png(out)
    elif out_type is None:
        return surface.write_to_png()
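A hypothetical call writing a PNG for a 200 bp window (paths are illustrative):

seqplot(['sample.bam'], chrom='1', start=1000000, end=1000200,
        fastafile='genome.fa', out='window.png', by_strand=True)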
Source: test_position_permutation_test.py (project: probabilistic2020, author: KarchinLab)
def test_ctnnb1_get_aa_mut_info():
    import pysam
    from prob2020.python.gene_sequence import GeneSequence

    # read fasta
    ctnnb1_fasta = os.path.join(file_dir, 'data/CTNNB1.fa')
    gene_fa = pysam.Fastafile(ctnnb1_fasta)
    gs = GeneSequence(gene_fa, nuc_context=1)

    # read CTNNB1 bed file
    ctnnb1_bed = os.path.join(file_dir, 'data/CTNNB1.bed')
    bed_list = [b for b in utils.bed_generator(ctnnb1_bed)]
    gs.set_gene(bed_list[0])

    # specify mutation
    coding_pos = [0]
    somatic_base = ['C']

    # check mutation info
    aa_info = mc.get_aa_mut_info(coding_pos, somatic_base, gs)
    ref_codon_msg = 'First codon should be start codon ({0})'.format(aa_info['Reference Codon'][0])
    assert aa_info['Reference Codon'][0] == 'ATG', ref_codon_msg
    assert aa_info['Somatic Codon'][0] == 'CTG', 'First "A" should be replaced with a "C"'
    assert aa_info['Codon Pos'][0] == 0, 'Start codon should be position 0'
def recover_unmapped_mut_info(mut_info, bed, sc, opts):
    # retrieve info based on annotated protein effects and genomic coordinates
    has_unmapped_opts = ('use_unmapped' in opts) and ('genome' in opts)
    use_unmapped = has_unmapped_opts and opts['use_unmapped'] and opts['genome']
    if use_unmapped:
        genome_fa = pysam.Fastafile(opts['genome'])
        # try to still use mutations that are not on the reference transcript
        tmp_mut_info = mut_info[mut_info['Coding Position'].isnull()]
        unmapped_mut_info = get_unmapped_aa_mut_info(tmp_mut_info,
                                                     genome_fa,
                                                     bed.strand,
                                                     bed.chrom,
                                                     opts['context'])
        genome_fa.close()
        # fill in tumor sample/tumor type info
        unmapped_mut_info['Tumor_Sample'] = tmp_mut_info['Tumor_Sample'].tolist()
        unmapped_mut_info['Tumor_Type'] = tmp_mut_info['Tumor_Type'].tolist()
        # filter out cases where the nucleotide context does not exist
        # on the reference transcript
        bad_contexts = [i for i in range(len(unmapped_mut_info['Context']))
                        if not sc.is_valid_context(unmapped_mut_info['Context'][i])]
        for key in unmapped_mut_info:
            unmapped_mut_info[key] = utils.filter_list(unmapped_mut_info[key],
                                                       bad_contexts)
    else:
        unmapped_mut_info = {'Context': [], 'Reference AA': [], 'Codon Pos': [],
                             'Somatic AA': [], 'Tumor_Allele': [],
                             'Tumor_Sample': [], 'Tumor_Type': []}
    return unmapped_mut_info
def main(opts):
    # read bed file, extract gene sequence from genome, write to fasta
    genome_fa = pysam.Fastafile(opts['input'])
    with open(opts['output'], 'w') as handle:
        for bed_row in utils.bed_generator(opts['bed']):
            fasta_seq = gs.fetch_gene_fasta(bed_row, genome_fa)
            handle.write(fasta_seq)
    genome_fa.close()
def __init__(self, options):
    # Opening tabix file representing the reference genome
    self.fastafile = pysam.Fastafile(options.args['reference'])

# Retrieving the sequence of a genomic region
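The trailing comment suggests the original class defines a region lookup next; a minimal sketch of such a method (the name and signature are assumptions, not the project's actual API):

def getReferenceSequence(self, chrom, start, end):
    # hypothetical helper: fetch() takes 0-based, half-open coordinates
    return self.fastafile.fetch(chrom, start, end)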
def __init__(self, task_queue, result_queue, bamName, referenceName, honH5):
    multiprocessing.Process.__init__(self)
    self.task_queue = task_queue
    self.result_queue = result_queue
    self.bam = pysam.Samfile(bamName)
    if referenceName is not None:
        self.reference = pysam.Fastafile(referenceName)
    else:
        self.reference = None
    self.honH5 = honH5
def is_nonsilent(mut_df, bed_dict, opts):
    # convert dictionary to list for bed objects
    gene_beds = [b
                 for chrom in bed_dict
                 for b in bed_dict[chrom]]

    # initiate gene sequences
    gene_fa = pysam.Fastafile(opts['input'])
    gs = GeneSequence(gene_fa, nuc_context=opts['context'])

    # non-silent SNV classes
    non_silent_snv = ['Nonsense_Mutation', 'Nonstop_Mutation', 'Splice_Site',
                      'Translation_Start_Site', 'Missense_Mutation']

    # record indels and get only snvs
    mut_df['is_nonsilent'] = 0
    indel_flag = indel.is_indel_annotation(mut_df)
    mut_df.loc[indel_flag, 'is_nonsilent'] = 1
    snv_df = mut_df[~indel_flag]

    # iterate over each gene
    for bed in gene_beds:
        # initiate for this gene
        tmp_df = snv_df[snv_df['Gene'] == bed.gene_name]
        gs.set_gene(bed)

        # compute context counts and somatic bases for each context
        gene_tuple = compute_mutation_context(bed, gs, tmp_df, opts)
        context_cts, context_to_mutations, mutations_df, gs, sc = gene_tuple

        if len(mutations_df):
            # get snv information
            tmp_mut_info = get_aa_mut_info(mutations_df['Coding Position'],
                                           mutations_df['Tumor_Allele'].tolist(),
                                           gs)
            # get string describing variant
            var_class = cutils.get_variant_classification(tmp_mut_info['Reference AA'],
                                                          tmp_mut_info['Somatic AA'],
                                                          tmp_mut_info['Codon Pos'])
            # detect if non-silent snv
            is_nonsilent_snv = [1 if (x in non_silent_snv) else 0
                                for x in var_class]
            mut_df.loc[tmp_df.index, 'is_nonsilent'] = is_nonsilent_snv

    # return a pandas series indicating nonsilent status
    is_nonsilent_series = mut_df['is_nonsilent'].copy()
    del mut_df['is_nonsilent']
    return is_nonsilent_series
def main(opts):
    # hack to index the FASTA file
    gene_fa = pysam.Fastafile(opts['input'])
    gene_fa.close()

    # get mutations
    mut_df = pd.read_csv(opts['mutations'], sep='\t')
    orig_num_mut = len(mut_df)

    # rename columns to fit my internal column names
    rename_dict = {
        'Hugo_Symbol': 'Gene',
        'Tumor_Sample_Barcode': 'Tumor_Sample',
        'Tumor_Seq_Allele2': 'Tumor_Allele'
    }
    mut_df.rename(columns=rename_dict, inplace=True)

    # restrict to only observed genes if flag present
    restricted_genes = None
    if opts['restrict_genes']:
        restricted_genes = set(mut_df['Gene'].unique())

    # process indels
    indel_df = indel.keep_indels(mut_df)  # return indels only
    indel_df.loc[:, 'Start_Position'] = indel_df['Start_Position'] - 1  # convert to 0-based
    indel_df.loc[:, 'indel len'] = indel_df['indel len'] + 1
    logger.info('There were {0} indels identified.'.format(len(indel_df)))
    mut_df = mut_df.dropna(subset=['Tumor_Allele', 'Start_Position', 'Chromosome'])
    logger.info('Kept {0} mutations after dropping mutations with missing '
                'information (Dropped: {1})'.format(len(mut_df), orig_num_mut - len(mut_df)))

    # select valid single nucleotide variants only
    mut_df = utils._fix_mutation_df(mut_df, opts['unique'])

    # read in bed info
    bed_dict = utils.read_bed(opts['bed'], restricted_genes)

    # perform permutation
    opts['handle'] = open(opts['output'], 'w')
    multiprocess_permutation(bed_dict, mut_df, opts, indel_df)

    # save indels
    if opts['maf']:
        mywriter = csv.writer(opts['handle'], delimiter='\t', lineterminator='\n')
        for maf_lines in indel.simulate_indel_maf(indel_df, bed_dict,
                                                  opts['num_iterations'],
                                                  opts['seed']):
            mywriter.writerows(maf_lines)
    opts['handle'].close()
def main(opts):
    # read in mutations
    mut_df = pd.read_csv(opts['mutations'], sep='\t')
    orig_num_mut = len(mut_df)

    # correct chromosome names
    mut_df['Chromosome'] = correct_chrom_names(mut_df['Chromosome'])

    # fix additional issues
    mut_df = mut_df.dropna(subset=['Tumor_Allele', 'Start_Position', 'Chromosome'])
    logger.info('Kept {0} mutations after dropping mutations with missing '
                'information (Dropped: {1})'.format(len(mut_df), orig_num_mut - len(mut_df)))
    mut_df = utils._fix_mutation_df(mut_df)

    # read genome fasta file
    genome_fa = pysam.Fastafile(opts['fasta'])

    # read BED file for transcripts
    bed_dict = utils.read_bed(opts['bed'], [])
    gene2bed = {item.gene_name: item
                for bed_list in bed_dict.values()
                for item in bed_list}

    # group mutations by gene
    mut_grpby = mut_df.groupby('Gene')
    unmapped_mut_list = []
    for i, mut_info in mut_grpby:
        gene_name = mut_info['Gene'].iloc[0]

        # try to find tx annotation for gene
        bed = None
        try:
            bed = gene2bed[gene_name]
        except KeyError:
            pass

        if bed:
            # get coding positions; mutations unmapped to the reference tx
            # will have NA for a coding position
            for ix, row in mut_info.iterrows():
                coding_pos = bed.query_position(bed.strand,
                                                row['Chromosome'],
                                                row['Start_Position'])
                if not coding_pos:
                    unmapped_mut_list.append(row.tolist())
        else:
            unmapped_mut_list += mut_info.values.tolist()

    # save the unmapped mutations to a file
    unmapped_mut_df = pd.DataFrame(unmapped_mut_list, columns=mut_df.columns)
    logger.info('{0} mutations were unmappable to a '
                'reference transcript'.format(len(unmapped_mut_df)))
    unmapped_mut_df.to_csv(opts['unmapped'], sep='\t', index=False)

    coord_base, coord_strand = detect_coordinates(mut_df, genome_fa)
    logger.info('RESULT: {0}-based coordinates, positions reported on {1} strand'.format(coord_base, coord_strand))
    genome_fa.close()
def test(argv):
    numpy.seterr(all="ignore")
    args = parseArgs(argv)
    setupLogging(True)  # keep debug on.. you're testing!
    logging.critical(("Running HSpots.py directly implements testing mode. "
                      "If you're trying to run the full, actual program, use "
                      "Honey.py spots"))

    bam = pysam.Samfile(args.bam)
    reference = pysam.Fastafile(args.reference)
    try:
        if bam.header["HD"]["SO"] != "coordinate":
            logging.warning("BAM is not sorted by coordinates! Performance may be slower")
    except KeyError:
        logging.warning("Assuming BAM is sorted by coordinate. Be sure this is correct")
    logging.info("Running in test mode")

    # do what you will.. from here
    #spot = SpotResult(chrom='11', start=2215290, end=2215798, svtype="DEL", size=208)
    #spot = SpotResult(chrom='22', start=45964261, end=45965596, svtype="DEL", size=-1)
    # This is what I need to start with
    #spot = SpotResult(chrom="22", start=45963975, end=45964532, svtype="DEL", size=57)
    fh = open("honeymissing.bed")
    for line in fh.readlines():
        data = line.strip().split('\t')
        spot = SpotResult(chrom=data[0], start=int(data[1]), end=int(data[2]),
                          size=int(data[3].split('=')[-1]), svtype="DEL")
        j = SpotCaller('group', spot.chrom, spot.start, spot.end, args)
        if j.supportingReadsFilter(spot, bam, args):
            consen = ConsensusCaller(spot, args)
            consen(bam, reference, 'none')
            for i in consen.newSpots:
                i.tags["seqmade"] = True
                print(i)
            if len(consen.newSpots) == 0:
                spot.tags["noseq"] = True
                print(str(spot))
        else:
            spot.tags["filtfail"] = True
            print(str(spot))

    # done with test code
    logging.info("Finished testing")
def test(argv):
    numpy.seterr(all="ignore")
    args = parseArgs(argv)
    setupLogging(True)  # keep debug on.. you're testing!
    logging.critical(("Running HSpots.py directly implements testing mode. "
                      "If you're trying to run the full, actual program, use "
                      "Honey.py spots"))

    bam = pysam.Samfile(args.bam)
    reference = pysam.Fastafile(args.reference)
    try:
        if bam.header["HD"]["SO"] != "coordinate":
            logging.warning("BAM is not sorted by coordinates! Performance may be slower")
    except KeyError:
        logging.warning("Assuming BAM is sorted by coordinate. Be sure this is correct")
    logging.info("Running in test mode")

    # do what you will.. from here
    # This is what I need to start with
    #spot = SpotResult(chrom="7", start=138402727, end=138402830, svtype="INS", size=113)
    chrom = "3"
    start, end = (195498264, 195498609)
    start -= 200
    end += 200
    spot = SpotResult(chrom=chrom, start=start, end=end, svtype="DEL", size=100)
    j = SpotCaller('group', spot.chrom, spot.start, spot.end, args)
    if j.supportingReadsFilter(spot, bam, args):
        consen = ConsensusCaller(spot, args)
        consen(bam, reference, 'none')
        for i in consen.newSpots:
            i.tags["seqmade"] = True
            print(i)
        if len(consen.newSpots) == 0:
            spot.tags["noseq"] = True
            print(str(spot))
    else:
        spot.tags["filtfail"] = True
        print(str(spot))

    # done with test code
    logging.info("Finished testing")
def bam2hist(args):
    samfile = pysam.Samfile(args.bam)
    (rname, start, end) = parse_region(args.region)
    refseq = None
    if args.genome:
        fasta = pysam.Fastafile(args.genome)
        refseq = fasta.fetch(reference=rname, start=start, end=end)

    if args.with_header:
        print('rname', 'pos', 'offset',
              'ref',
              'nread', 'major_base', 'major_count', 'minor_count',
              'major_freq', 'minor_freq',
              'N', 'A', 'T', 'C', 'G', 'D', 'ndel', 'nins',
              sep='\t')

    if args.use_multimaps:
        skip_flag = BAM_FLAGS.unmapped
    else:
        skip_flag = SKIP_FLAG

    for p in samfile.pileup(reference=rname, start=start, end=end, mask=skip_flag):
        if (p.pos < start) or (end is not None and end <= p.pos):  # skip outside of region
            continue
        h = ReadHist()
        for read in p.pileups:
            if read.query_position is None:
                continue
            info = ReadInfo(read, skip_flag=skip_flag)
            if info.skip_flag:
                continue
            h.add_read(info)
        major_base = h.major_base()
        major_count = getattr(h, major_base)
        minor_count = h.total_count() - major_count
        major_freq = h.major_freq()
        if major_freq > args.max_major_freq:
            continue
        print(rname, p.pos + 1, p.pos - start,
              '.' if refseq is None else refseq[p.pos - start],
              p.n, major_base, major_count, minor_count,
              '{0:.3f}'.format(major_freq), '{0:.3f}'.format(1 - major_freq),
              h.N, h.A, h.T, h.C, h.G, h.D, h.ndel, h.nins,
              sep='\t')
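bam2hist reads its parameters from an argparse-style namespace; a hypothetical invocation (attribute names mirror those used above, paths are illustrative):

from types import SimpleNamespace

args = SimpleNamespace(bam='sample.bam', region='chr1:10000-11000',
                       genome='genome.fa', with_header=True,
                       use_multimaps=False, max_major_freq=0.99)
bam2hist(args)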