def digest(fasta_records, enzyme):
    """
    Divide a genome into restriction fragments.

    Parameters
    ----------
    fasta_records : OrderedDict
        Dictionary of chromosome names to sequence records. Each record is
        converted with ``str()`` before it is searched for cut sites.
    enzyme: str
        Name of restriction enzyme, looked up as an attribute of
        ``Bio.Restriction``.

    Returns
    -------
    Dataframe with columns: 'chrom', 'start', 'end'.
    One row per fragment, chromosomes in the iteration order of
    ``fasta_records``, with a fresh 0..n-1 index. An empty
    ``fasta_records`` yields an empty frame with the same columns.

    Raises
    ------
    ValueError
        If ``enzyme`` does not name an object in ``Bio.Restriction`` that
        provides a ``search`` attribute.
    """
    # Imported here rather than at module level, so Biopython is only needed
    # when digest() is actually called.
    import Bio.Restriction as biorst
    import Bio.Seq as bioseq

    # http://biopython.org/DIST/docs/cookbook/Restriction.html#mozTocId447698
    try:
        cut_finder = getattr(biorst, enzyme).search
    except AttributeError:
        # The AttributeError carries no extra information for the caller.
        raise ValueError('Unknown enzyme name: {}'.format(enzyme)) from None

    columns = ['chrom', 'start', 'end']

    def _each(chrom):
        seq = bioseq.Seq(str(fasta_records[chrom]))
        # Fragment boundaries: sequence start, every shifted cut position,
        # sequence end. Cast to a fixed 64-bit integer so the dtype does not
        # depend on the platform's default ``int`` width.
        # NOTE(review): the "+ 1" shift is preserved from the original code;
        # Biopython documents search() positions as 1-based, so confirm this
        # offset yields the intended coordinates -- TODO verify.
        cuts = np.r_[0, np.array(cut_finder(seq)) + 1, len(seq)].astype(np.int64)
        n_frags = len(cuts) - 1
        return pd.DataFrame(
            {
                'chrom': [chrom] * n_frags,
                'start': cuts[:-1],
                'end': cuts[1:],
            },
            columns=columns,
        )

    frames = [_each(chrom) for chrom in fasta_records]
    if not frames:
        # pd.concat raises on an empty list; return a well-formed empty table.
        empty = pd.DataFrame(columns=columns)
        return empty.astype({'start': np.int64, 'end': np.int64})
    return pd.concat(frames, axis=0, ignore_index=True)
评论列表
文章目录