import pandas as pd
import pysam

def main(opts):
    # read in INDEL mutations
    indels = pd.read_csv(opts['input'], sep='\t')

    # index the black list (tabix column indices are 0-based here;
    # the coordinates stored in the file itself are 1-based)
    pysam.tabix_index(opts['blacklist'], force=True,
                      seq_col=0, start_col=1, end_col=2)

    # query the black list to find INDELs with no hits
    non_coding_ixs, coding_ixs = [], []
    black_list = pysam.Tabixfile(opts['blacklist'])
    for i, row in indels.iterrows():
        result = black_list.fetch(reference=row['Chromosome'],
                                  start=row['Start_Position'],
                                  end=row['End_Position'])
        if not list(result):
            non_coding_ixs.append(i)
        else:
            coding_ixs.append(i)
    black_list.close()

    # save non-coding and coding INDELs separately
    # (.ix was removed from pandas; use label-based .loc instead)
    indels.loc[non_coding_ixs, :].to_csv(opts['output'], sep='\t', index=False)
    indels.loc[coding_ixs, :].to_csv(opts['blacklist_output'], sep='\t', index=False)
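A hypothetical invocation sketch for main() above. The dict keys mirror the ones the function reads, but the paths are placeholders; the black list is assumed to be bgzip-compressed already, so tabix_index only (re)builds the .tbi and Tabixfile can open the same path:

# Hypothetical driver for main(); file names are placeholders, not part of
# any particular project.
if __name__ == '__main__':
    opts = {
        'input': 'indels.txt',                   # tab-separated INDEL calls
        'blacklist': 'blacklist.bed.gz',         # bgzipped chrom/start/end regions
        'output': 'indels.noncoding.txt',        # INDELs with no black-list hit
        'blacklist_output': 'indels.coding.txt'  # INDELs overlapping the black list
    }
    main(opts)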
Python pysam.Tabixfile() — example source code
def create_tabix_infile(file_name):
    return pysam.Tabixfile(file_name)
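A minimal usage sketch for the helper above; the file name is a placeholder:

# Hypothetical usage; regions.tsv.gz must be bgzipped with a tabix index
# (regions.tsv.gz.tbi) alongside it.
tbx = create_tabix_infile('regions.tsv.gz')
for row in tbx.fetch('chr1', 10000, 20000):
    print(row.split('\t'))
tbx.close()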
def __init__(self, vcffile=None):
    self.vcffile = vcffile
    self.filename = os.path.splitext(os.path.basename(str(vcffile)))[0]
    self.dbnfsp_reader = pysam.Tabixfile(settings.dbnsfp, encoding='utf-8')
    # with an explicit encoding, pysam yields already-decoded header lines
    self.header = next(iter(self.dbnfsp_reader.header)).strip().split('\t')
    # create the dbnfsp folder if it doesn't exist
    if not os.path.exists('dbnfsp'):
        os.makedirs('dbnfsp')
def __prep_tabix(self, path):
    # cache one Tabixfile handle per path so repeated queries reuse it
    if path not in self.readers:
        self.readers[path] = Tabixfile(path)
    return self.readers[path]
def __init__(self, options, genelist, transcriptlist):
    self.options = options
    # Opening tabix file representing the Ensembl database
    self.tabixfile = pysam.Tabixfile(options.args['ensembl'])
    self.proteinSeqs = dict()
    self.exonSeqs = dict()
    self.genelist = genelist
    self.transcriptlist = transcriptlist

# Finding transcripts overlapping with a variant
def __init__(self, options):
    # Opening tabix file representing the dbSNP database
    self.tabixfile = pysam.Tabixfile(options.args['dbsnp'])

# Annotating a variant based on dbSNP data
def __init__(self, threadidx, options, config, startline, endline, names):
    multiprocessing.Process.__init__(self)

    # Initializing variables
    self.threadidx = threadidx
    self.options = options
    self.config = config
    self.startline = startline
    self.endline = endline
    self.names = names

    # Initializing reads dictionary
    self.reads = dict()

    # Connecting to BAM file
    self.samfile = pysam.Samfile(options.input, "rb")

    # Connecting to transcript database file
    if config['transcript_db'] is not None:
        self.enstdb = pysam.Tabixfile(config['transcript_db'])
    else:
        self.enstdb = None

    # Initializing output files; with multiple threads, each worker writes
    # to its own temporary files
    if int(options.threads) > 1:
        if config['outputs']['regions']:
            self.out_targets = open(options.output + '_regions_tmp_' + str(threadidx) + '.txt', 'w')
        if config['outputs']['profiles']:
            self.out_profiles = open(options.output + '_profiles_tmp_' + str(threadidx) + '.txt', 'w')
        if config['transcript_db'] is not None:
            self.out_poor = open(options.output + '_poor_tmp_' + str(threadidx) + '.txt', 'w')
    else:
        if config['outputs']['regions']:
            self.out_targets = open(options.output + '_regions.txt', 'w')
        if config['outputs']['profiles']:
            self.out_profiles = open(options.output + '_profiles.txt', 'w')
        if config['transcript_db'] is not None:
            self.out_poor = open(options.output + '_poor.txt', 'w')

# Checking if target is fail or pass
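As a hedged illustration of how such a worker might use the transcript database handle opened above — this helper is not part of the original class, and takes the pysam.Tabixfile as an argument:

# Hypothetical helper (not in the original class): list entries of a
# tabix-indexed transcript database overlapping a region. `enstdb` is a
# pysam.Tabixfile such as self.enstdb from __init__ above.
def transcripts_overlapping(enstdb, chrom, start, end):
    if enstdb is None:
        return []
    try:
        return [line.split('\t') for line in enstdb.fetch(chrom, start, end)]
    except ValueError:
        # chromosome absent from the index
        return []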
def __init__(self, options):
    self.options = options
    # Opening tabix file representing the Ensembl database
    self.tabixfile = pysam.Tabixfile(options.args['ensembl'])
    self.proteinSeqs = dict()
    self.exonSeqs = dict()

# Finding transcripts that overlap with a variant
def __init__(self, options):
    # Opening tabix file representing the dbSNP database
    self.tabixfile = pysam.Tabixfile(options.args['dbsnp'])

# Annotating a variant based on dbSNP data
def __init__(self, vcffile=None):
    self.vcffile = vcffile
    self.filename = os.path.splitext(os.path.basename(str(vcffile)))[0]

    # create the merge folder if it doesn't exist
    if not os.path.exists('merge'):
        os.makedirs('merge')
    # enter the folder
    os.chdir('merge')

    self.annotation_files = OrderedDict()

    pysam.tabix_index('../snpeff/snpeff.output.vcf', preset='vcf')
    self.annotation_files['snpeff'] = {
        'info': 'EFF',
        'file': pysam.Tabixfile('../snpeff/snpeff.output.vcf.gz', 'r', encoding="utf-8")
    }

    pysam.tabix_index('../vep/vep.output.sorted.vcf', preset='vcf')
    self.annotation_files['vep'] = {
        'info': 'CSQ',
        'file': pysam.Tabixfile('../vep/vep.output.sorted.vcf.gz', 'r', encoding="utf-8")
    }

    pysam.tabix_index('../snpsift/snpsift.final.vcf', preset='vcf')
    self.annotation_files['vartype'] = {
        'info': 'VARTYPE,SNP,MNP,INS,DEL,MIXED,HOM,HET',
        'file': pysam.Tabixfile('../snpsift/snpsift.final.vcf.gz', 'r', encoding="utf-8")
    }

    pysam.tabix_index('../decipher/hi_predictions.vcf', preset='vcf')
    self.annotation_files['decipher'] = {
        'info': 'HI_PREDICTIONS',
        'file': pysam.Tabixfile('../decipher/hi_predictions.vcf.gz', 'r', encoding="utf-8")
    }

    pysam.tabix_index('../pynnotator/pynnotator.vcf', preset='vcf')
    # genomes1k dbsnp clinvar esp6500 ensembl_phen ensembl_clin
    self.pynnotator_tags = ['genomes1k', 'dbsnp', 'clinvar', 'esp6500', 'ensembl_phen', 'ensembl_clin']
    self.annotation_files['pynnotator'] = {
        'info': 'ALL',
        'file': pysam.Tabixfile('../pynnotator/pynnotator.vcf.gz', 'r', encoding="utf-8")
    }

    pysam.tabix_index('../func_pred/func_pred_sorted.vcf', preset='vcf')
    self.annotation_files['dbnfsp'] = {
        'info': 'dbNSFP_SIFT_score,dbNSFP_SIFT_converted_rankscore,dbNSFP_SIFT_pred,dbNSFP_Uniprot_acc_Polyphen2,dbNSFP_Uniprot_id_Polyphen2,dbNSFP_Uniprot_aapos_Polyphen2,dbNSFP_Polyphen2_HDIV_score,dbNSFP_Polyphen2_HDIV_rankscore,dbNSFP_Polyphen2_HDIV_pred,dbNSFP_Polyphen2_HVAR_score,dbNSFP_Polyphen2_HVAR_rankscore,dbNSFP_Polyphen2_HVAR_pred,dbNSFP_LRT_score,dbNSFP_LRT_converted_rankscore,dbNSFP_LRT_pred,dbNSFP_LRT_Omega,dbNSFP_MutationTaster_score,dbNSFP_MutationTaster_converted_rankscore,dbNSFP_MutationTaster_pred,dbNSFP_MutationTaster_model,dbNSFP_MutationTaster_AAE,dbNSFP_MutationAssessor_UniprotID,dbNSFP_MutationAssessor_variant,dbNSFP_MutationAssessor_score,dbNSFP_MutationAssessor_rankscore,dbNSFP_MutationAssessor_pred,dbNSFP_FATHMM_score,dbNSFP_FATHMM_converted_rankscore,dbNSFP_FATHMM_pred,dbNSFP_PROVEAN_score,dbNSFP_PROVEAN_converted_rankscore,dbNSFP_PROVEAN_pred,dbNSFP_Transcript_id_VEST3,dbNSFP_Transcript_var_VEST3,dbNSFP_VEST3_score,dbNSFP_VEST3_rankscore,dbNSFP_MetaSVM_score,dbNSFP_MetaSVM_rankscore,dbNSFP_MetaSVM_pred,dbNSFP_MetaLR_score,dbNSFP_MetaLR_rankscore,dbNSFP_MetaLR_pred,dbNSFP_Reliability_index,dbNSFP_M-CAP_score,dbNSFP_M-CAP_rankscore,dbNSFP_M-CAP_pred,dbNSFP_REVEL_score,dbNSFP_REVEL_rankscore,dbNSFP_MutPred_score,dbNSFP_MutPred_rankscore,dbNSFP_MutPred_protID,dbNSFP_MutPred_AAchange,dbNSFP_MutPred_Top5features,dbNSFP_CADD_raw,dbNSFP_CADD_raw_rankscore,dbNSFP_CADD_phred,dbNSFP_DANN_score,dbNSFP_DANN_rankscore,dbNSFP_fathmm-MKL_coding_score,dbNSFP_fathmm-MKL_coding_rankscore,dbNSFP_fathmm-MKL_coding_pred,dbNSFP_fathmm-MKL_coding_group,dbNSFP_Eigen_coding_or_noncoding,dbNSFP_Eigen-raw,dbNSFP_Eigen-phred,dbNSFP_Eigen-PC-raw,dbNSFP_Eigen-PC-phred,dbNSFP_Eigen-PC-raw_rankscore,dbNSFP_GenoCanyon_score,dbNSFP_GenoCanyon_score_rankscore,dbNSFP_integrated_fitCons_score,dbNSFP_integrated_fitCons_rankscore,dbNSFP_integrated_confidence_value,dbNSFP_GM12878_fitCons_score,dbNSFP_GM12878_fitCons_rankscore,dbNSFP_GM12878_confidence_value,dbNSFP_H1-hESC_fitCons_score,dbNSFP_H1-hESC_fitCons_rankscore,dbNSFP_H1-hESC_confidence_value,dbNSFP_HUVEC_fitCons_score,dbNSFP_HUVEC_fitCons_rankscore,dbNSFP_clinvar_rs,dbNSFP_clinvar_clnsig,dbNSFP_clinvar_trait,dbNSFP_clinvar_golden_stars',
        'file': pysam.Tabixfile('../func_pred/func_pred_sorted.vcf.gz', 'r', encoding="utf-8")
    }

    self.dbsnp = pysam.Tabixfile(settings.dbsnp, 'r', encoding="utf-8")
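A minimal, hypothetical query against one of the handles built above. pysam.Tabixfile.fetch takes 0-based, half-open coordinates, so a 1-based VCF position POS is fetched as (POS - 1, POS):

# Hypothetical usage sketch, assuming `ann` is an instance of the class above.
snpeff = ann.annotation_files['snpeff']['file']
pos = 123456                                  # 1-based VCF position (placeholder)
for line in snpeff.fetch('1', pos - 1, pos):  # pysam ranges are 0-based, half-open
    print(line.split('\t')[7])                # INFO column of the matching record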
def annotate(self, out_prefix):
    # dbNSFP column reference (positions within the header line):
    #  24 SIFT_score: SIFT score (SIFTori).
    # 105 HUVEC_confidence_value: 0 - highly significant scores (approx. p < .003); 1 - significant scores
    # 188 clinvar_rs: rs number from the clinvar data set
    # 191 clinvar_golden_stars: ClinVar Review Status summary.
    dbnfsp_reader = pysam.Tabixfile(settings.dbnsfp, 'r', encoding='utf-8')

    # keep the last (tab-delimited) header line as the column names
    for item in dbnfsp_reader.header:
        header = item.strip().split('\t')

    vcffile = 'dbnfsp/part.%s.vcf' % (out_prefix)
    vcf_reader = open('%s' % (vcffile))
    vcf_writer = open('dbnfsp/dbnfsp.%s.vcf' % (out_prefix), 'w', encoding="utf-8")

    for line in vcf_reader:
        if line.startswith('#'):
            if line.startswith('#CHROM'):
                # dbnfsp_header (the extra INFO header lines) is defined
                # elsewhere in the module
                vcf_writer.writelines(dbnfsp_header)
            vcf_writer.writelines(line)
        else:
            variant = line.split('\t')
            variant[0] = variant[0].replace('chr', '')
            try:
                # pysam ranges are 0-based, half-open; VCF POS is 1-based
                records = dbnfsp_reader.fetch(variant[0], int(variant[1]) - 1, int(variant[1]))
            except ValueError:
                records = []
            for record in records:
                ann = record.strip().split('\t')
                ispresent = False
                if variant[3] == ann[2]:
                    # compare ALT alleles
                    alts = variant[4].split(',')
                    alts_ann = ann[3].split(',')
                    for alt in alts:
                        if alt in alts_ann:
                            ispresent = True
                    if ispresent:
                        new_ann = []
                        for k, item in enumerate(header):
                            if ann[k] != '.':
                                new_ann.append('dbNSFP_%s=%s' % (item, ann[k].replace(';', '|')))
                        variant[7] = '%s;%s' % (variant[7], ";".join(new_ann))
            vcf_writer.writelines("\t".join(variant))
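A hedged driver sketch for annotate(). The class name and part count are placeholders inferred from the 'dbnfsp/part.%s.vcf' naming pattern, not part of the original project:

# Hypothetical driver; assumes dbnfsp/part.1.vcf ... dbnfsp/part.4.vcf were
# written by an earlier splitting step, and `Annotator` stands in for the
# (unnamed) class that owns annotate().
annotator = Annotator(vcffile='sample.vcf')
for part in range(1, 5):
    annotator.annotate(str(part))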
def tabix_file(self, record, reference_tabix,
               columns=None, fallback='unknown'):
    """
    Retrieve columns from a tabix file
    (i.e. a bgzipped and tabix-indexed TSV file)
    :param record: query VCF record
    :param reference_tabix: instance of Tabixfile
    :param columns: names of columns to retrieve
    :param fallback: fallback value
    :return: dict of {column: value}
    """
    default = {x: fallback for x in columns}
    # reconcile 'chr' prefixes between the query and the reference contigs
    if record.CHROM.startswith('chr') and not any(
        [x.startswith('chr') for x in reference_tabix.contigs]
    ):
        chrom = record.CHROM.replace('chr', '')
    else:
        chrom = record.CHROM
    # adjust range (pysam's ranges are 0-based, half-open)
    start, end = record.POS - 1, record.POS
    # we ONLY consider the LAST line of the header; it MUST be tab-delimited
    if not hasattr(self, "__tabix_header_{0}".format(reference_tabix)):
        setattr(self, "__tabix_header_{0}".format(reference_tabix),
                [x for x in reference_tabix.header])
    tabix_header = getattr(
        self, "__tabix_header_{0}".format(reference_tabix)
    )[-1].strip().split("\t")
    try:
        ref_iter = reference_tabix.fetch(chrom, start=start, end=end)
    except ValueError:
        return default
    ref_piter = PeekableIterator(ref_iter)
    if ref_piter.peek() is None:
        return default
    for ref_rec in ref_piter:
        row = ref_rec.split('\t')
        assert len(row) == len(tabix_header), \
            "{0} {1}".format(len(row), len(tabix_header))
        # only take values when POS, REF and ALT all match the query record
        if str(record.POS) == row[1] and record.REF == row[2] and \
                ','.join([str(x) for x in record.ALT]) == row[3]:
            for colname in columns:
                default[colname] = collapse_values(row[tabix_header.index(colname)])
        else:
            if ref_piter.peek() is None:
                return default
    return default
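A hedged usage sketch for tabix_file(). Here `annotator` stands in for an instance of the owning class, `record` is a PyVCF-style record with the CHROM/POS/REF/ALT attributes the method reads, and the file path and column names are placeholders:

# Hypothetical usage; scores.tsv.gz is a placeholder bgzipped, tabix-indexed
# TSV whose last header line names its columns, as the method requires.
tbx = pysam.Tabixfile('scores.tsv.gz')
values = annotator.tabix_file(record, tbx, columns=['score', 'rankscore'],
                              fallback='NA')
print(values)  # e.g. {'score': '0.91', 'rankscore': 'NA'}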
def __init__(self, stream, path=None, tabix_path=None,
             record_checks=None, parsed_samples=None):
    #: stream (``file``-like object) to read from
    self.stream = stream
    #: optional ``str`` with the path to the stream
    self.path = path
    #: optional ``str`` with path to tabix file
    self.tabix_path = tabix_path
    #: checks to perform on records, can contain 'FORMAT' and 'INFO'
    self.record_checks = tuple(record_checks or [])
    #: if set, list of samples to parse for
    self.parsed_samples = parsed_samples
    #: the ``pysam.TabixFile`` used for reading from indexed bgzip-ed VCF;
    #: constructed on the fly
    self.tabix_file = None
    #: the iterator through the tabix file to use
    self.tabix_iter = None
    #: the parser to use
    self.parser = parser.Parser(stream, self.path, self.record_checks)
    #: the Header
    self.header = self.parser.parse_header(parsed_samples)
Source file: generate-snp-list.py
Project: personal-identification-pipeline
Author: TeamErlich
def __init__(self, filename):
    self.tabix_file_name = filename
    # TODO: catch TABIX exceptions
    self.tabix = pysam.TabixFile(filename)
def IndexedVariantFileReader(phenocode):
    filepath = common_filepaths['pheno_gz'](phenocode)
    with read_gzip(filepath) as f:
        reader = csv.reader(f, dialect='pheweb-internal-dialect')
        colnames = next(reader)
    assert colnames[0].startswith('#')
    colnames[0] = colnames[0][1:]
    for field in colnames:
        assert field in conf.parse.per_variant_fields or field in conf.parse.per_assoc_fields, (field)
    colidxs = {field: colnum for colnum, field in enumerate(colnames)}
    with pysam.TabixFile(filepath, parser=None) as tabix_file:
        yield _ivfr(tabix_file, colidxs)
def context(self):
    with pysam.TabixFile(self._filepath, parser=None) as tabix_file:
        yield _mr(tabix_file, self._colidxs, self._colidxs_for_pheno, self._info_for_pheno)
def fetch(self, chrom_or_region, begin=None, end=None):
    """Jump to the start position of the given chromosomal position
    and limit iteration to the end position

    :param str chrom_or_region: name of the chromosome to jump to if
        begin and end are given and a samtools region string otherwise
        (e.g. "chr1:123,456-123,900").
    :param int begin: 0-based begin position (inclusive)
    :param int end: 0-based end position (exclusive)
    """
    if (begin is None) != (end is None):
        raise ValueError('begin and end must both be None or neither')
    # close tabix file if any is open
    if self.tabix_file and not self.tabix_file.closed:
        self.tabix_file.close()
    # open tabix file if not yet open
    if not self.tabix_file or self.tabix_file.closed:
        self.tabix_file = pysam.TabixFile(
            filename=self.path, index=self.tabix_path)
    # jump to the given position
    if begin is None:
        self.tabix_iter = self.tabix_file.fetch(region=chrom_or_region)
    else:
        self.tabix_iter = self.tabix_file.fetch(
            reference=chrom_or_region, start=begin, end=end)
    return self
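A hypothetical usage sketch for fetch() above; `reader` stands in for an instance of the class, opened on a bgzipped VCF with a tabix index:

# Hypothetical usage; both call styles documented in the docstring.
reader.fetch('chr1:123,456-123,900')            # samtools-style region string
for line in reader.tabix_iter:                  # raw tab-delimited lines
    print(line)
reader.fetch('chr1', begin=123455, end=123900)  # 0-based, half-open range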
def read_tabix(fp, chrom=None, start=None, end=None):
    with closing(pysam.TabixFile(fp)) as f:
        names = list(f.header) or None
        df = pd.read_csv(
            io.StringIO('\n'.join(f.fetch(chrom, start, end))),
            sep='\t', header=None, names=names)
    return df
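A one-line usage sketch for read_tabix(); the file name is a placeholder:

# Hypothetical usage; fragments.tsv.gz is a placeholder bgzipped, indexed TSV.
df = read_tabix('fragments.tsv.gz', chrom='chr1', start=0, end=1000000)
print(df.head())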
def load_fragments(options, sample, dataset, chrom=None, start=None, end=None, usecols=None,
                   min_reads_per_frag=1):
    if start is not None and start < 0:
        raise Exception("start coord is negative: {}:{}-{}".format(chrom, start, end))
    if start is not None and end is not None and start >= end:
        raise Exception("end coord is before start: {}:{}-{}".format(chrom, start, end))

    readclouds_path = os.path.join(
        options.results_dir,
        "CombineReadcloudsStep",
        "readclouds.{}.{}.tsv.gz".format(sample.name, dataset.id))
    tabix = pysam.TabixFile(readclouds_path)

    if chrom is not None and chrom not in tabix.contigs:
        print("MISSING:", chrom)
        return pandas.DataFrame(columns="chrom start_pos end_pos bc num_reads obs_len hap".split())

    if usecols is not None and "num_reads" not in usecols:
        usecols.append("num_reads")

    s = StringIO.StringIO("\n".join(tabix.fetch(chrom, start, end)))
    readclouds = pandas.read_table(s, header=None, names=Readcloud._fields, usecols=usecols)
    readclouds["chrom"] = readclouds["chrom"].astype("string")

    # keep fragments with at least min_reads_per_frag reads
    if min_reads_per_frag > 0:
        readclouds = readclouds.loc[readclouds["num_reads"] >= min_reads_per_frag]
    return readclouds
def validate(self):
    assert os.path.exists(self.bam), "missing bam file '{}' for sample '{}' and dataset '{}'".format(
        self.bam, self.sample.name, self.id)
# @staticmethod
# def from_longranger_dir(self, longranger_dir):
#     fragments = os.path.join(longranger_dir,
#                              "PHASER_SVCALLER_CS/PHASER_SVCALLER/_REPORTER/"
#                              "REPORT_SINGLE_PARTITION/fork0/files/fragments.h5")
#     bam = os.path.join(longranger_dir,
#                        "PHASER_SVCALLER_CS/PHASER_SVCALLER/ATTACH_PHASING/"
#                        "fork0/files/phased_possorted_bam.bam")
#     phased_fragments = os.path.join(longranger_dir,
#                                     "10XSARCOMAC1/PHASER_SVCALLER_CS/PHASER_SVCALLER/"
#                                     "_SNPINDEL_PHASER/PHASE_SNPINDELS/fork0/files/"
#                                     "fragment_phasing.tsv.gz")
#     self.validate()
#     return TenXDataset(bam, fragments, phased_fragments)

# def load_phased_fragments(self, chrom=None, start=None, end=None):
#     columns = ["chrom", "start_pos", "end_pos", "phase_set", "ps_start",
#                "ps_end", "bc", "h0", "h1", "hmix", "unkn"]
#     try:
#         tabix = pysam.TabixFile(self.phased_fragments)
#         s = StringIO.StringIO("\n".join(tabix.fetch(chrom, start, end)))
#         frags = pandas.read_table(s)
#         frags.columns = columns
#     except (IOError, ValueError):
#         frags = pandas.DataFrame(columns=columns)
#     return frags

# def load_fragments(self, chrom=None, start=None, end=None):
#     tabix = pysam.TabixFile()
#     try:
#         fragments = utilities.read_data_frame(self.fragments)
#         goodbcs = utilities.get_good_barcodes(fragments)
#         fragments = fragments.loc[fragments["bc"].isin(goodbcs)]
#         # fragments = fragments.loc[fragments["num_reads"]>5]
#         if chrom is not None:
#             fragments = fragments.loc[fragments["chrom"]==chrom]
#         return fragments
#     except:
#         logging.exception("Unable to load fragments from fragments file "
#                           "'{}'".format(self.fragments))
#         raise