def get_microbe_taxids(force_download=False):
    """
    Build a table of bacterial reference/representative genomes keyed by taxid.

    On the first call (or when ``force_download`` is true) the RefSeq bacterial
    assembly summary is downloaded from the NCBI genome ftp site, reduced to
    rows whose ``refseq_category`` is 'reference genome' or 'representative
    genome', annotated with a ``wdid`` column, and cached in the current
    working directory as the tab-separated file ``reference_genomes.csv``.
    Later calls load that cached file instead of downloading again.

    All columns are read as strings; ``taxid`` is converted to int just before
    the frame is returned. Rows with no entry in the ``id_mapper`` lookup hold
    ``None`` in ``wdid`` after a fresh download; when loaded from the cache the
    same cells come back as pandas missing values.

    :param force_download: if true, ignore any cached file and download again
    :return: pandas dataframe of bacteria reference genome data
    """
    cache_path = "reference_genomes.csv"

    if force_download or not os.path.exists(cache_path):
        # urlretrieve without a target filename stores the download in a
        # temporary file; it has to be deleted here or it is left behind.
        tmp_path, _headers = urllib.request.urlretrieve(
            "ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/assembly_summary.txt")
        try:
            # The first line of the file is skipped; the second line is used
            # as the column header.
            df = pd.read_csv(tmp_path, sep="\t", dtype=object, skiprows=1, header=0)
        finally:
            try:
                os.remove(tmp_path)
            except OSError:
                # Best-effort cleanup only: a failure to delete the temporary
                # file must not mask the parsed result or a parsing error.
                pass

        # .copy() makes the filtered rows an independent frame, so adding the
        # 'wdid' column below does not write into a slice of the full table.
        keep = df['refseq_category'].isin(['reference genome', 'representative genome'])
        df = df[keep].copy()

        # presumably a mapping of taxid string -> Wikidata item id for
        # property P685; id_mapper is defined outside this function -- verify.
        all_tax_wdid = id_mapper('P685')
        df['wdid'] = df['taxid'].apply(lambda x: all_tax_wdid.get(x, None))

        # Drop the leading comment marker from the first column name. The
        # spelling without the space is accepted too, in case the upstream
        # file uses that form; names missing from the frame are ignored.
        df = df.rename(columns={
            '# assembly_accession': 'assembly_accession',
            '#assembly_accession': 'assembly_accession',
        })
        df.to_csv(cache_path, sep="\t")
    else:
        # use predownloaded and parsed flatfile
        df = pd.read_csv(cache_path, sep="\t", dtype=object, index_col=0)

    df['taxid'] = df['taxid'].astype(int)
    return df
评论列表
文章目录