def fasta_to_dict(fasta_filename):
# not clear the use of biopython gains anything here
with open(fasta_filename) as f:
return {k: str(v.seq) for k,v in
SeqIO.to_dict(SeqIO.parse(f,'fasta')).iteritems()}
python类to_dict()的实例源码
def main():
records = SeqIO.to_dict(SeqIO.parse(open(sys.argv[1]), 'fasta'))
reader = csv.DictReader(sys.stdin, dialect="excel-tab")
clusters = list(reader)
groups = set([c['group'] for c in clusters])
for group in groups:
print "cluster%s\t%s-cluster%s" % (group, sys.argv[1], group)
with open('%s-cluster%s' %(sys.argv[1], group), 'w') as fout:
SeqIO.write([records[i['node']] for i in clusters if i['group'] == group], fout, 'fasta')
def __init__(self, seq_table, records, max_dist, min_fold, threshold_pval, log=None):
'''
seq_table: pandas.DataFrame
Samples on the columns; sequences on the rows
records: index of Bio.Seq
Indexed, unaligned input sequences. This could come from BioPython's
SeqIO.to_dict or SeqIO.index.
max_dist: float
genetic distance cutoff above which a sequence will not be merged into an OTU
min_fold: float
Multiply the sequence's abundance by this fold to get the minimum abundance
of an OTU for merging
threshold_pval: float
P-value below which a sequence will not be merged into an OTU
log: filehandle
Log file reporting the abundance, genetic, and distribution checks.
'''
self.seq_table = seq_table
self.records = records
self.max_dist = max_dist
self.min_fold = min_fold
self.threshold_pval = threshold_pval
self.log = log
# get a list of the names of the sequences in order of their (decreasing) abundance
self.seq_abunds = self.seq_table.sum(axis=1).sort_values(ascending=False)
# check that all sequence IDs in the table are in the fasta
missing_ids = [seq_id for seq_id in self.seq_abunds.index if seq_id not in self.records]
if len(missing_ids) > 0:
raise RuntimeError("{} sequence IDs found in the sequence table but not in the fasta: {}".format(len(missing_ids), missing_ids))
# initialize OTU information
self.membership = {}
self.otus = []
def extract_genes(output_folder, refs_folder, gff_folder, di):
tp = 'gene'
refs_files = sorted([join(refs_folder, f) for f in listdir(refs_folder) if isfile(join(refs_folder, f))])
gff_files = sorted([join(gff_folder, f) for f in listdir(gff_folder) if isfile(join(gff_folder, f))])
limit_info = dict( gff_type =['gene'])
sequences = []
for ind in range(len(refs_files)):
f_fasta = refs_files[ind]
f_gff = gff_files[ind]
nm = f_gff.split("/")[-1][:-4]
record = SeqIO.to_dict(SeqIO.parse(f_fasta, "fasta"))
new_record = {}
p = re.compile("N\w_\w+\.\d")
for it in record:
new_id = p.findall(it)[0]
new_record[new_id] = record[it]
record = new_record
in_handle = open(f_gff)
for rec in GFF.parse(in_handle, limit_info=limit_info, base_dict=record):
for f in rec.features:
if f.qualifiers['ID'][0] in di.get(f_gff, []):
subrecord = SeqRecord.SeqRecord(Seq.Seq(re.sub(r'[^ATGC]', 'A', str(f.extract(rec.seq)))),
id=nm + "_" + rec.id + "_" + str(f.location),
name=nm + "_" + str(f.location)+ "_" + tp,
description= tp)
sequences.append(subrecord)
in_handle.close()
with open(output_folder+'sc_genes_seq.fasta', "w") as output_handle:
SeqIO.write(sequences, output_handle, "fasta")
def partial_genes_extract(contigs_path, outdir):
contigs = SeqIO.to_dict(SeqIO.parse(contigs_path, "fasta"))
partial_genes = []
with open(outdir+'prodigal_output.gff') as f:
for rec in GFF.parse(f, base_dict=contigs):
for feature in rec.features:
extract_seq = str(feature.extract(rec.seq))
if feature.qualifiers['partial'][0] in ['11', '01', '10'] and len(extract_seq) > 110:
subrecord = SeqRecord.SeqRecord(Seq.Seq(extract_seq),
id='gene_'+str(len(partial_genes))+'_len_'+str(len(extract_seq)) +
'_conf_'+str(feature.qualifiers['conf'][0]),
description='patrial gene')
partial_genes.append(subrecord)
return partial_genes
def write_genes(genes_align, outdir):
records = []
orfs = SeqIO.to_dict(SeqIO.parse(outdir+'ORFs.fasta', 'fasta'))
for gene_cluster in genes_align:
for orf in genes_align[gene_cluster][1]:
if orfs[orf].seq is not None:
records.append(SeqRecord.SeqRecord(seq=orfs[orf].seq, id='{0}_{1}'.format(gene_cluster, orf),
name='{0}_{1}_conf={2}_len={3}'.format(gene_cluster,
orf,
genes_align[gene_cluster][0],
len(orfs[orf]))))
SeqIO.write(records, outdir+'results.fasta', 'fasta')
def add_EVM(whole_fasta_name, output_filename, output_merged_fasta_name):
"""
this module looks for genes that were not used in the consensus stage. usually are gene models without long reads
support
"""
sys.stdout.write('\t###APPEND EVM NOT USED FROM CONTIGS BUILDING###\n')
'''Adds the EVM records that are not present in the final contig evidence'''
whole_fasta = open(whole_fasta_name, 'r')
out_fasta_file = open(output_filename, 'r')
outputMerged = open(output_merged_fasta_name, 'w')
wholeDict = SeqIO.to_dict(SeqIO.parse(whole_fasta, 'fasta'))
count = 0
dictOut = {}
outFasta = SeqIO.parse(out_fasta_file, 'fasta')
for record in outFasta:
if record.id in dictOut:
dictOut[str(record.id) + '_' + str(count)] = str(record.seq)
count += 1
else:
dictOut[record.id] = str(record.seq)
for key in list(wholeDict.keys()):
if 'evm' in key and key not in dictOut:
ident = '>Gene' + str(count) + '_' + key
outputMerged.write(
ident + '\n' + str(wholeDict[key].seq) + '\n')
count += 1
for key, element in list(dictOut.items()):
ident = '>Gene' + str(count) + '_' + key
outputMerged.write(ident + '\n' + str(element) + '\n')
count += 1
whole_fasta.close()
outFasta.close()
outputMerged.close()
def fasta2Dict(fasta_filename):
"""
Prepare a dictionary of all the sequences that is used together with
the fasta file to make single fasta files for the assembly
"""
fasta_file = open(fasta_filename, 'r')
fasta_dict2 = SeqIO.to_dict(SeqIO.parse(fasta_file, 'fasta'))
fasta_dict = {}
for key, seq2 in list(fasta_dict2.items()):
seq = str(seq2.seq).replace("N", "")
fasta_dict[key] = seq
del fasta_dict2[key]
fasta_file.close()
return fasta_dict
def dna_from_reference(chrom='9'):
#reference_hg19 = '/dsde/data/deep/vqsr/Homo_sapiens_assembly19.fasta'
reference_hg19 = '/Users/sam/vqsr_data/Homo_sapiens_assembly19.fasta'
record_dict = SeqIO.to_dict(SeqIO.parse(reference_hg19, "fasta"))
dna = str(record_dict[chrom].seq[10000000:75000000])
return dna
def dna_from_reference(chrom='21'):
reference_hg19 = '/dsde/data/deep/vqsr/Homo_sapiens_assembly19.fasta'
record_dict = SeqIO.to_dict(SeqIO.parse(reference_hg19, "fasta"))
dna = str(record_dict[chrom].seq[35000000:37000000])
return dna
def main():
'''
Back translate aligned amino acid sequences to nucleotide sequences while maintaining the alignment
'''
#creating a parser
parser = argparse.ArgumentParser(
formatter_class=argparse.RawDescriptionHelpFormatter,
description='Translate amino acid sequences into nucleotide sequences while maintaining the amino acid alignment and the order of sequences.')
#adding arguments
parser.add_argument('-a', metavar='<aa_aln.fasta>', type=str,
help='input amino acid alignment file')
parser.add_argument('-n', metavar='<nuc_aln.fasta>', type=str,
help='input unaligned/aligned nucleotide/codon sequence file')
parser.add_argument('-o', metavar='<codon_aln.fasta>', type=str,
help='output codon alignment file')
args = parser.parse_args()
#set up output file name if none is given
if args.o is None:
nuc_aln_file = "codon_aln.fasta"
else:
nuc_aln_file = args.o
aa_aln_file = args.a
nuc_seq_file = args.n
aa_aln = AlignIO.read(aa_aln_file, "fasta")
nuc_seq = open(nuc_seq_file) #read in codon file
codon_dict = SeqIO.to_dict(SeqIO.parse(nuc_seq, "fasta")) #read in codon sequence file as a dictionary, key=sequence ID, value=sequence itself
back_translate(aa_aln,codon_dict,nuc_aln_file,gencode)
def read_seq_records_dict(input_object, format='fasta'):
"""Read SeqRecord objects to a dictionary from a file in the specified format.
:param input_object: A file object or a file name.
:param format: Input format (fasta by default).
:returns: An iterator of SeqRecord objects.
:rtype: dict
"""
handle = input_object
if type(handle) == str:
handle = open(handle, "rU")
return SeqIO.to_dict(SeqIO.parse(handle, format))
def main(gff_file, fasta_file):
out_file = "%s.gb" % os.path.splitext(gff_file)[0]
fasta_input = SeqIO.to_dict(SeqIO.parse(fasta_file, "fasta", generic_dna))
gff_iter = GFF.parse(gff_file, fasta_input)
SeqIO.write(_check_gff(_fix_ncbi_id(gff_iter)), out_file, "genbank")
def parse_contigs(outputAssembly, threshold_float, verbose):
"""
Parses the output from iAssembler, to a single FASTA file
"""
if verbose:
sys.stderr.write('Executing: Parse assembled consensus\n')
fname = outputAssembly + 'contig_member'
fname_exists = os.path.isfile(fname)
if fname_exists:
# part all the files in the tmp assembly folder
contigInfo = open(outputAssembly + 'contig_member', 'r')
contigs = {}
total_reads = 0
for line in contigInfo:
line = line.strip()
line = line.split('\t')
# count the reads for each assembled Unitig
read_number = len(line) - 1
for element in line:
if 'evm' in element:
contigs[line[0]] = [read_number, element]
if line[0] not in contigs:
contigs[line[0]] = [read_number]
# sum all the reads whitin one cluster
total_reads += read_number
contigInfo.close()
# calculate the number of reads considering the threshold
threshold = total_reads * float(threshold_float)
global count_sequences
real_contigs = {}
count_unitigs = 1
for key, element in list(contigs.items()):
# to retrieve only supported assembly
if len(element) == 2:
if element[0] >= threshold and 'evm' in element[1]:
real_contigs[key] = element[1] + ' ' + str(
element[0]) + '_above_threshold_' + str(threshold) + ' loc_' + str(count_sequences)
elif element[0] < threshold and 'evm' in element[1]:
real_contigs[key] = element[1] + ' ' + str(
element[0]) + '_below_threshold_' + str(threshold) + ' loc_' + str(count_sequences)
elif len(element) == 1:
if element[0] >= threshold:
real_contigs[key] = 'Unitig' + str(count_sequences) + '_' + str(count_unitigs) + ' ' + str(
element[0]) + '_above_threshold_' + str(threshold) + ' loc_' + str(count_sequences)
count_unitigs += 1
# writes the outputs
fileAssembly = outputAssembly + 'unigene_seq.new.fasta'
contigSeq = open(fileAssembly, 'r')
contigDict = SeqIO.to_dict(SeqIO.parse(contigSeq, 'fasta'))
output_filename = outputAssembly[:-1] + '_assembled.fasta'
outputFile = open(output_filename, 'w')
for iden, write_iden in list(real_contigs.items()):
if iden in contigDict:
outputFile.write('>' + write_iden + '\n' + str(contigDict[iden].seq) + '\n')
contigSeq.close()
outputFile.close()
return
def load_dna_and_chrom_label(args, only_labels=None):
record_dict = SeqIO.to_dict(SeqIO.parse(args.reference_fasta, "fasta"))
bed_dict, bed_labels = bed_file_labels_to_dict(args.bed_file)
train_data = np.zeros(( args.samples, args.window_size, len(args.inputs) ))
train_labels = np.zeros(( args.samples, len(bed_labels) ))
idx_offset = (args.window_size/2)
amiguity_codes = {'K':[0,0,0.5,0.5], 'M':[0.5,0.5,0,0], 'R':[0.5,0,0,0.5], 'Y':[0,0.5,0.5,0], 'S':[0,0.5,0,0.5],
'W':[0.5,0,0.5,0], 'B':[0,0.333,0.333,0.334], 'V':[0.333,0.333,0,0.334],'H':[0.333,0.333,0.334,0],
'D':[0.333,0,0.333,0.334], 'X':[0.25,0.25,0.25,0.25], 'N':[0.25,0.25,0.25,0.25]}
count = 0
while count < args.samples:
contig_key, pos = sample_from_bed(bed_dict, contig_key_prefix='chr')
contig = record_dict[contig_key]
record = contig[pos-idx_offset: pos+idx_offset]
cur_label_key = bed_file_label(bed_dict, contig_key, pos)
if only_labels and not cur_label_key in only_labels:
continue
train_labels[count, args.labels[cur_label_key]] = 1
for i,b in enumerate(record.seq):
B=b.upper()
if B in args.inputs.keys():
train_data[count, i, args.inputs[B]] = 1.0
elif B in amiguity_codes.keys():
train_data[count, i, :4] = amiguity_codes[B]
else:
print('Error! Unknown code:', b)
return
count += 1
print('Label:', bed_labels.keys(), 'label counts:', np.sum(train_labels, axis=0))
print('Train data shape:', train_data.shape, ' Training labels shape:', train_labels.shape)
return (train_data, train_labels)
def extract_kmer_seq(kmer_bedfile, hg19_fa_file, outfile_kmers_seq):
'''
open bed file to write the kmer sequence
'''
with open(outfile_kmers_seq,'w') as of:
pass
wf = open(outfile_kmers_seq,'a+')
# read names and postions from bed file
records = SeqIO.to_dict(SeqIO.parse(open(hg19_fa_file), 'fasta'))
with open(kmer_bedfile) as rf:
for line in rf:
name,start,stop,gene,val,strand,rem = bedline_split(line)
start = start
long_seq_record = records[name]
long_seq = long_seq_record.seq
alphabet = long_seq.alphabet
short_seq = str(long_seq)[start-1:stop]
short_seq_record = SeqRecord(Seq(short_seq, alphabet))
short_seq_record.seq.strip()
'''
If the sequence is in reverse strand, rev complement it
'''
if strand == "+":
kmer_seq = short_seq_record.seq
elif strand == "-":
kmer_seq = short_seq_record.seq.reverse_complement()
else:
print("Strand information not found for line:\n {}".format(line))
#quit()
wf.write("{}\t{}\t{}\t{}\t{}\t{}{}\n".format(name,start,stop,gene,kmer_seq,strand,rem))
rf.close()
wf.close()
of.close()
file_chk = check_files(outfile_kmers_seq)
return(file_chk)
def extract_kmer_seq(kmer_bedfile, hg19_fa_file, outfile_kmers_seq):
'''
open bed file to write the kmer sequence
'''
with open(outfile_kmers_seq,'w') as of:
pass
wf = open(outfile_kmers_seq,'a+')
# read names and postions from bed file
records = SeqIO.to_dict(SeqIO.parse(open(hg19_fa_file), 'fasta'))
with open(kmer_bedfile) as rf:
for line in rf:
name,start,stop,gene,val,strand = bedline_split(line)
long_seq_record = records[name]
long_seq = long_seq_record.seq
alphabet = long_seq.alphabet
short_seq = str(long_seq)[start-1:stop]
short_seq_record = SeqRecord(Seq(short_seq, alphabet))
short_seq_record.seq.strip()
'''
If the sequence is in reverse strand, rev complement it
#edit 14-march-2017 (to extract kmers strictly to half open bed format)
'''
if strand == "+":
kmer_seq = short_seq_record.seq
#kmer created will be one base greater than the kmer size, trim
kmer_seq = kmer_seq[:1]
elif strand == "-":
kmer_seq = short_seq_record.seq.reverse_complement()
kmer_seq = kmer_seq[:-1]
else:
print("Strand information not found for line:\n {}".format(line))
quit()
wf.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(name,start,stop,gene,kmer_seq,strand))
rf.close()
wf.close()
file_chk = check_files(outfile_kmers_seq)
return(file_chk)