def parse_gene_id_and_name():
"""return a mapping of genes ids to gene names/description as dictionary
deliberately only looking at protein coding genes
"""
gene_id_to_name = dict()
# modelled after http://genome.crg.es/~lpryszcz/scripts/gb2gtf.py
#allowed_types = set(['gene', 'CDS', 'tRNA', 'tmRNA', 'rRNA', 'ncRNA'])
allowed_types = set(['CDS'])
#wanted_qualifiers = set(['product', 'locus_tag'])
with open(GB_FILE) as fh:
for gb in SeqIO.parse(fh, 'gb'):
for f in gb.features:
if f.type not in allowed_types:
continue
qualifiers = dict(f.qualifiers)
assert len(qualifiers['locus_tag'])==1 and len(qualifiers['product'])==1
gene_id = qualifiers['locus_tag'][0]
gene_name = qualifiers['product'][0]
assert len(qualifiers['locus_tag']) == 1, (qualifiers['locus_tag'])
gene_id_to_name[gene_id] = gene_name
return gene_id_to_name
评论列表
文章目录