def get_evm_pr(evm_path,ref_fa,out_path):
'''this function get all evm proteins, output to files and merge them together
* evm_path: evm path that has gff file
* ref_fa: reference fa file
* out_path: path to save all temperary files and final protein files
'''
if os.path.exists(out_path):
shutil.rmtree(out_path)
os.mkdir(out_path)
os.chdir(out_path)
evm_gff= evm_path + '/evm.merge.gff'
gff_df = pd.read_csv(evm_gff,sep='\t',header=None)
dic = SeqIO.index(ref_fa,'fasta')
cds_df = gff_df[gff_df[2].values=='CDS']
cds_df = cds_df.reset_index(drop=True)
cds_df['rna_id'] = cds_df[8].map(lambda x: x.split(';')[1][7:])
scaffolds = list(set(cds_df[0].tolist()))
for scaff in scaffolds:
output_cds(scaff,cds_df,dic)
# merge files
fns = natsorted(glob.glob('*.fa'))
sarge.run('cat {fns} > {out}'.format(fns=' '.join(fns),out='pr_merge.fa'))
for f in fns:
os.remove(f)
python类index()的实例源码
def add_gene_function(blast_db,evm_path):
'''add gene symbol to gff file. the information is from the blast results
'''
blastp_fn = blast_db + '/blastp.txt'
blast_df = pd.read_csv(blastp_fn,sep='\t',usecols=[0,1,2],names=['ref','query','per'])
blast_df = blast_df[blast_df['per'].values>50]
blast_df['rna'] = blast_df['ref'].map(lambda x: '.'.join(x.split('.')[-2:]))
blast_df['pr'] = blast_df['query'].map(lambda x: x.split('|')[-1].split('_')[0])
rna_pr_dic = blast_df.set_index('rna')['pr'].to_dict()
evm_gff= evm_path + '/evm.merge.gff'
gff_df = pd.read_csv(evm_gff,sep='\t',header=None)
gff_df[8] = gff_df[8].map(lambda x: add_gene_name(x,rna_pr_dic))
gff_df = gff_df[~gff_df[8].map(lambda x: 'gene=LORF2' in x)]
gff_df.to_csv(blast_db +'/final.gff',sep='\t',index=False)
# add_gene_function(blast_db,evm_path)
#===============================================================================
# process the gmap results and exonerates results directly
#===============================================================================
#=============== 1. get all mapped geneid, rna_accession, pr_accession
def fa2embl(fa,embl,gff,path):
if not os.path.exists(path): os.mkdir(path)
os.chdir(path)
df = pd.read_csv(gff,sep='\t',header=None,comment='#',usecols=[0,2])
df = df[df[2].values=='gene']
chroms = list(set(df[0].tolist()))
dic = SeqIO.index(fa,'fasta')
for s in chroms:
SeqIO.write(dic[s],open('fa','w'),'fasta')
sarge.run('grep \'{s}\' {gff} > gff'.format(s=s,gff=gff))
sarge.run('/home/shangzhong/Installation/EMBOSS-6.6.0/bin/seqret \
-sequence fa -feature -fformat gff -fopenfile1 gff -osformat2 embl \
-auto -outseq {s}.embl'.format(s=s))
fns = glob.glob('*.embl')
sarge.run('cat {files} > {embl}'.format(files=' '.join(fns),embl=embl))
# for f in fns:
# os.remove(f)
# fa2embl('/data/genome/hamster/ncbi_refseq/hamster.fa','hamster.embl','/data/genome/hamster/ncbi_refseq/hamster.gff','/data/shangzhong/Picr_assembly/Annotation/RATT/embl')
def testNewAminoAcid(self):
self.setUpPhylotyper(aa=True)
# Set up subtype files
build_pipeline(self.subtype_options, self.configObj)
# Save setup
self.subtypeOptionsObj.save()
# Check output files
filepaths = self.subtypeOptionsObj.get_subtype_config(self.scheme)
fasta = SeqIO.index(filepaths['alignment'][0], 'fasta')
with open(filepaths['subtype']) as f:
for i, l in enumerate(f):
pass
sd = SeqDict()
sd.load(filepaths['lookup'])
n = 91
self.assertTrue(all([len(fasta) == n, i+1 == n, len(sd.seqs) == n]))
def testNewDNA(self):
self.setUpPhylotyper(aa=False)
# Set up subtype files
build_pipeline(self.subtype_options, self.configObj)
# Save setup
self.subtypeOptionsObj.save()
# Check output files
filepaths = self.subtypeOptionsObj.get_subtype_config(self.scheme)
fasta = SeqIO.index(filepaths['alignment'][0], 'fasta')
with open(filepaths['subtype']) as f:
for i, l in enumerate(f):
pass
sd = SeqDict()
sd.load(filepaths['lookup'])
n = 120
self.assertTrue(all([len(fasta) == n, i+1 == n, len(sd.seqs) == n]))
def _load_strain_sequences(self, strain_gempro):
"""Load strain sequences from the orthology matrix into the base model for comparisons, and into the
strain-specific model itself.
"""
if self._orthology_matrix_has_sequences: # Load directly from the orthology matrix if it contains sequences
strain_sequences = self.df_orthology_matrix[strain_gempro.id].to_dict()
else: # Otherwise load from the genome file if the orthology matrix contains gene IDs
# Load the genome FASTA file
log.debug('{}: loading strain genome CDS file'.format(strain_gempro.genome_path))
strain_sequences = SeqIO.index(strain_gempro.genome_path, 'fasta')
for strain_gene in strain_gempro.genes:
if strain_gene.functional:
if self._orthology_matrix_has_sequences:
strain_gene_key = strain_gene.id
else:
# Pull the gene ID of the strain from the orthology matrix
strain_gene_key = self.df_orthology_matrix.loc[strain_gene.id, strain_gempro.id]
log.debug('{}: original gene ID to be pulled from strain fasta file'.format(strain_gene_key))
# # Load into the base strain for comparisons
ref_gene = self.reference_gempro.genes.get_by_id(strain_gene.id)
new_id = '{}_{}'.format(strain_gene.id, strain_gempro.id)
if ref_gene.protein.sequences.has_id(new_id):
log.debug('{}: sequence already loaded into reference model'.format(new_id))
continue
ref_gene.protein.load_manual_sequence(seq=strain_sequences[strain_gene_key], ident=new_id,
set_as_representative=False)
log.debug('{}: loaded sequence into reference model'.format(new_id))
# Load into the strain GEM-PRO
strain_gene.protein.load_manual_sequence(seq=strain_sequences[strain_gene_key], ident=new_id,
set_as_representative=True)
log.debug('{}: loaded sequence into strain model'.format(new_id))
def build_strain_specific_models(self, save_models=False):
"""Using the orthologous genes matrix, create and modify the strain specific models based on if orthologous
genes exist.
Also store the sequences directly in the reference GEM-PRO protein sequence attribute for the strains.
"""
if len(self.df_orthology_matrix) == 0:
raise RuntimeError('Empty orthology matrix')
# Create an emptied copy of the reference GEM-PRO
for strain_gempro in tqdm(self.strains):
log.debug('{}: building strain specific model'.format(strain_gempro.id))
# For each genome, load the metabolic model or genes from the reference GEM-PRO
logging.disable(logging.WARNING)
if self._empty_reference_gempro.model:
strain_gempro.load_cobra_model(self._empty_reference_gempro.model)
elif self._empty_reference_gempro.genes:
strain_gempro.genes = [x.id for x in self._empty_reference_gempro.genes]
logging.disable(logging.NOTSET)
# Get a list of genes which do not have orthology in the strain
not_in_strain = self.df_orthology_matrix[pd.isnull(self.df_orthology_matrix[strain_gempro.id])][strain_gempro.id].index.tolist()
# Mark genes non-functional
self._pare_down_model(strain_gempro=strain_gempro, genes_to_remove=not_in_strain)
# Load sequences into the base and strain models
self._load_strain_sequences(strain_gempro=strain_gempro)
if save_models:
cobra.io.save_json_model(model=strain_gempro.model,
filename=op.join(self.model_dir, '{}.json'.format(strain_gempro.id)))
strain_gempro.save_pickle(op.join(self.model_dir, '{}_gp.pckl'.format(strain_gempro.id)))
log.info('Created {} new strain-specific models and loaded in sequences'.format(len(self.strains)))
def __init__(self, seq_table, records, max_dist, min_fold, threshold_pval, log=None):
'''
seq_table: pandas.DataFrame
Samples on the columns; sequences on the rows
records: index of Bio.Seq
Indexed, unaligned input sequences. This could come from BioPython's
SeqIO.to_dict or SeqIO.index.
max_dist: float
genetic distance cutoff above which a sequence will not be merged into an OTU
min_fold: float
Multiply the sequence's abundance by this fold to get the minimum abundance
of an OTU for merging
threshold_pval: float
P-value below which a sequence will not be merged into an OTU
log: filehandle
Log file reporting the abundance, genetic, and distribution checks.
'''
self.seq_table = seq_table
self.records = records
self.max_dist = max_dist
self.min_fold = min_fold
self.threshold_pval = threshold_pval
self.log = log
# get a list of the names of the sequences in order of their (decreasing) abundance
self.seq_abunds = self.seq_table.sum(axis=1).sort_values(ascending=False)
# check that all sequence IDs in the table are in the fasta
missing_ids = [seq_id for seq_id in self.seq_abunds.index if seq_id not in self.records]
if len(missing_ids) > 0:
raise RuntimeError("{} sequence IDs found in the sequence table but not in the fasta: {}".format(len(missing_ids), missing_ids))
# initialize OTU information
self.membership = {}
self.otus = []
def _process_record(self, record_id):
'''
Process the next sequence: run the genetic, abundance, and distribution checks, either
merging the sequence into an existing OTU or creating a new OTU.
'''
assert record_id in self.seq_table.index
record = self.records[record_id]
candidate = OTU(record.id, str(record.seq), self.seq_table.loc[record.id])
if self.log is not None:
print('seq', candidate.name, sep='\t', file=self.log)
merged = False
for otu in self.ga_matches(candidate):
test_pval = candidate.distribution_pval(otu)
if self.log is not None:
print(candidate.name, 'distribution_check', otu.name, test_pval, sep='\t', file=self.log)
if test_pval > self.threshold_pval:
otu.absorb(candidate)
self.membership[otu.name].append(candidate.name)
merged = True
break
if not merged:
# form own otu
self.otus.append(candidate)
self.membership[candidate.name] = [candidate.name]
def generate_otu_table(self):
'''
Process all the input sequences to make an OTU table.
returns: pandas.DataFrame
OTU table (which can also be found at instance.otu_table)
'''
for record_id in self.seq_abunds.index:
self._process_record(record_id)
self.otus.sort(key=lambda otu: otu.abundance, reverse=True)
self.otu_table = pd.DataFrame([otu.counts for otu in self.otus], index=[otu.name for otu in self.otus])
self.otu_table.columns = self.seq_table.columns
return self.otu_table
def read_sequence_table(fn):
'''
Read in a table of sequences. Expect a header and the sequence IDs in the
first column. Samples are on the columns.
fn: filename (or handle)
returns: pandas.DataFrame
'''
df = pd.read_table(fn, dtype={0: str}, header=0)
df.index = df.iloc[:,0]
df = df.iloc[:,1:].astype(int)
return df
def call_otus(seq_table_fh, fasta_fh, output_fh, dist_crit, abund_crit, pval_crit, log=None, membership=None):
'''
Read in input files, call OTUs, and return output.
seq_table_fh: filehandle
sequence count table
fasta_fh: filehandle or filename
sequences fasta
output_fh: filehandle
place to write main output OTU table
dist_crit, abund_crit, pval_crit: float
threshold values for distance, abundance, and pvalue
log, membership: filehandles
places to write supplementary output
'''
# read in the sequences table
seq_table = read_sequence_table(seq_table_fh)
# set up the input fasta records
records = SeqIO.index(fasta_fh, 'fasta')
# generate the caller object
caller = DBCaller(seq_table, records, dist_crit, abund_crit, pval_crit, log)
caller.generate_otu_table()
caller.write_otu_table(output_fh)
if membership is not None:
caller.write_membership(membership)
def readGenome(fasta):
genome_dict = SeqIO.index(fasta, "fasta")
print(len(genome_dict))
return(genome_dict)
def augustus_prepare_hint(pasa,exonerate):
'''
'''
dfs = []
for g,t,feature in zip([pasa,exonerate],['E','P'],['exonpart','CDSpart']):
df = pd.read_csv(g,sep='\t',header=None)
df[2] = df[2].map(lambda x: feature)
df[8] = df[8].map(lambda x: x+';grp='+re.search('(?<=ID=).+?(?=;)',x).group(0)+';src='+t)
dfs.append(df)
res = pd.concat(dfs)
res.to_csv('hints.gff',sep='\t',index=False,header=None)
def gene_rna_pr_id(hamster_id,gmap_gff,out_fn):
'''this fnction get all gene rna pr id, including both refseq and gff information.
* hamster_id: a file that has all ids in hamster.gff file
* gmap_gff: gff results mapped using gmap
* out_fn:
'''
# rna accession in gff file
ham_id_df = pd.read_csv(hamster_id,sep='\t',header=0)
ham_id_df = ham_id_df.astype('str')
ham_id_df['TrAccess'] = ham_id_df['TrAccess'].map(lambda x: x.split('.')[0])
ham_id_df['PrAccess'] = ham_id_df['PrAccess'].map(lambda x: x.split('.')[0])
rna_gene_dic = ham_id_df.set_index('TrAccess')['GeneID'].to_dict()
rna_pr_dic = ham_id_df.set_index('TrAccess')['PrAccess'].to_dict()
#-------- read rna gff file
rna_df = pd.read_csv(gmap_gff,sep='\t',header=None,comment='#')
# add rna accession column
rna_df['rna_ac'] = rna_df[8].map(lambda x: re.search('(?<=ID=).+?(?=\.)',x).group(0))
mrna = list(set(rna_df['rna_ac'].tolist()))
# new rna in refseq compared to gff
new_ref_rna = list(set(mrna) - set(rna_gene_dic.keys()))
# get geneid for new ref_rna gene id
for r in new_ref_rna:
handle = Entrez.efetch(db='nucleotide',id=r,rettype='gb',retmode='text').read()
geneid = re.search('(?<=GeneID:).+?(?=\")',handle).group(0)
try:
p = re.search('(?<=protein_id=\").+?(?=\.)',handle).group(0)
except:
p = '-'
rna_gene_dic[r] = geneid
rna_pr_dic[r] = p
# transfer dic to dataframe
r_g_df = pd.DataFrame.from_dict(rna_gene_dic,'index')
r_g_df.columns = ['geneid']
r_p_df = pd.DataFrame.from_dict(rna_pr_dic,'index')
r_p_df.columns = ['pr_ac']
g_r_p_df = pd.concat([r_g_df,r_p_df],axis=1)
g_r_p_df['rna_ac'] = g_r_p_df.index
g_r_p_df[['geneid','rna_ac','pr_ac']].to_csv(out_fn,sep='\t',index=False)
def extract_long_reads():
"""Filter fastq to longest reads."""
parser = argparse.ArgumentParser(description='Extract longest reads from a fastq.')
parser.add_argument('input',
help='Input .fastq file.')
parser.add_argument('output',
help='Output .fastq file.')
parser.add_argument('longest', default=10, type=int,
help='Percentage of longest reads to partition.')
parser.add_argument('--others', default=None,
help='Write all other reads to file.')
args = parser.parse_args()
record_dict = SeqIO.index(args.input, "fastq")
ids = list(record_dict.keys())
lengths = np.fromiter(
(len(record_dict[i]) for i in ids),
dtype=int, count=len(ids)
)
max_reads = len(ids) * (args.longest / 100)
longest = np.argpartition(lengths, -max_reads)[-max_reads:]
SeqIO.write(
(record_dict[ids[i]] for i in longest),
args.output, 'fastq'
)
if args.others is not None:
longest = set(longest)
SeqIO.write(
(record_dict[ids[i]] for i in range(len(ids)) if i not in longest),
args.others, 'fastq'
)
def test_fragment_stats(self):
"""Test the gathering of fragment statistics."""
top = path.dirname(__file__)
bam = path.join(top, "data/test_bam_stats/stat_test.bam")
ref = path.join(top, "data/test_bam_stats/stat_ref.fas")
references = SeqIO.index(ref, format='fasta')
chrom_lengths = {name: len(so) for name, so in six.iteritems(references)}
res = stats.frag_coverage(bam, chrom_lengths, region=None, min_aqual=0, verbose=False)
self.maxDiff = None
target = [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
self.assertEqual(list(res['frags_fwd']['seq_0']), target)
def _fasta_to_dict(filename):
"""SeqIO.index wrapper for fasta files"""
return SeqIO.index(filename=filename, format="fasta")
def _clean_index(index):
"""Clean all elements from an indexed fasta"""
index_clean = {}
for key, value in index.items():
index_clean[key] = _clean_seqrecord(value)
return index_clean
def index_fasta(filename):
"""Create a fasta dict, with clean descriptions, key=id, value=seqrecord"""
index = SeqIO.index(
filename=filename,
format="fasta"
)
return _clean_index(index)
def _filter_orthology_matrix(self,
remove_strains_with_no_orthology=True,
remove_strains_with_no_differences=False,
remove_genes_not_in_base_model=True):
"""Filters the orthology matrix by removing genes not in our base model, and also
removes strains from the analysis which have: 0 orthologous genes or no difference from the base strain.
Args:
remove_strains_with_no_orthology (bool): Remove strains which have no orthologous genes found
remove_strains_with_no_differences (bool): Remove strains which have all the same genes as the base model.
Default is False because since orthology is found using a PID cutoff, all genes may be present but
differences may be on the sequence level.
remove_genes_not_in_base_model (bool): Remove genes from the orthology matrix which are not present in our
base model. This happens if we use a genome file for our model that has other genes in it.
"""
if len(self.df_orthology_matrix) == 0:
raise RuntimeError('Empty orthology matrix')
initial_num_strains = len(self.strains)
# Adding names to the row and column of the orthology matrix
self.df_orthology_matrix = self.df_orthology_matrix.rename_axis('gene').rename_axis("strain", axis="columns")
# Gene filtering (of the orthology matrix)
if remove_genes_not_in_base_model:
# Check for gene IDs that are in the model and not in the orthology matrix
# This is probably because: the CDS FASTA file for the base strain did not contain the correct ID
# for the gene and consequently was not included in the orthology matrix
# Save these and report them
reference_strain_gene_ids = [x.id for x in self.reference_gempro.genes]
self.missing_in_orthology_matrix = [x for x in reference_strain_gene_ids if x not in self.df_orthology_matrix.index.tolist()]
self.missing_in_reference_strain = [y for y in self.df_orthology_matrix.index.tolist() if y not in reference_strain_gene_ids]
# Filter the matrix for genes within our base model only
self.df_orthology_matrix = self.df_orthology_matrix[self.df_orthology_matrix.index.isin(reference_strain_gene_ids)]
log.info('Filtered orthology matrix for genes present in base model')
log.warning('{} genes are in your base model but not your orthology matrix, see the attribute "missing_in_orthology_matrix"'.format(len(self.missing_in_orthology_matrix)))
log.warning('{} genes are in the orthology matrix but not your base model, see the attribute "missing_in_reference_strain"'.format(len(self.missing_in_reference_strain)))
# Strain filtering
for strain_gempro in self.strains.copy():
if remove_strains_with_no_orthology:
if strain_gempro.id not in self.df_orthology_matrix.columns:
self.strains.remove(strain_gempro)
log.info('{}: no orthologous genes found for this strain, removed from analysis.'.format(strain_gempro.id))
continue
elif self.df_orthology_matrix[strain_gempro.id].isnull().all():
self.strains.remove(strain_gempro)
log.info('{}: no orthologous genes found for this strain, removed from analysis.'.format(strain_gempro.id))
continue
if remove_strains_with_no_differences:
not_in_strain = self.df_orthology_matrix[pd.isnull(self.df_orthology_matrix[strain_gempro.id])][strain_gempro.id].index.tolist()
if len(not_in_strain) == 0:
self.strains.remove(strain_gempro)
log.info('{}: strain has no differences from the base, removed from analysis.')
continue
log.info('{} strains to be analyzed, {} strains removed'.format(len(self.strains), initial_num_strains - len(self.strains)))