def get_assembly_report(self, taxid):
    """Fetch the assembly report for ``taxid`` and cache it in ``self.chr_df``.

    The assembly summary table (``self.ass_sum``) is loaded on demand via
    ``self.get_assembly_summaries()``. The row for ``taxid`` whose
    ``refseq_category`` is ``'reference genome'`` is selected; if there is
    none, ``'representative genome'`` is tried instead. The file
    ``<ftp_path>/<assembly>_assembly_report.txt`` is then downloaded, parsed
    as a tab-separated table and stored under ``self.chr_df[taxid]`` with the
    hyphenated column names renamed to hyphen-free identifiers.

    Args:
        taxid: Taxonomy id; it is interpolated into the ``DataFrame.query``
            expression that filters ``self.ass_sum``.

    Raises:
        ValueError: If the selection does not yield exactly one assembly,
            or if the downloaded report contains no ``#`` header line.
    """
    if self.ass_sum is None:
        self.get_assembly_summaries()

    # Prefer the "reference genome"; fall back to "representative genome"
    # (needed for mouse and rat).
    for category in ("reference genome", "representative genome"):
        df = self.ass_sum.query(
            "taxid == {} & refseq_category == '{}'".format(taxid, category)
        )
        if len(df) > 0:
            break
    if len(df) != 1:
        raise ValueError("unknown reference: {}".format(df))
    print(df)

    # Build the URL with explicit "/" separators: os.path.join/split follow
    # the local OS conventions and would produce backslashes on Windows.
    ftp_path = df["ftp_path"].iloc[0].rstrip("/")
    assembly = ftp_path.rsplit("/", 1)[-1]
    url = "{}/{}_assembly_report.txt".format(ftp_path, assembly)
    print(url)

    # Close the connection deterministically once the body has been read.
    with request.urlopen(request.Request(url)) as response:
        table = response.read().decode()

    # The column names are taken from the last "#"-prefixed line of the file.
    header_lines = [line for line in table.split("\n") if line.startswith("#")]
    if not header_lines:
        raise ValueError("no '#' header line in assembly report: {}".format(url))
    names = header_lines[-1].strip().lstrip("# ").split("\t")

    report = pd.read_csv(StringIO(table), sep="\t", names=names, comment="#")
    # Rename the hyphenated columns to hyphen-free names.
    self.chr_df[taxid] = report.rename(
        columns={
            "Sequence-Name": "SequenceName",
            "Sequence-Role": "SequenceRole",
            "Assigned-Molecule": "AssignedMolecule",
            "Assigned-Molecule-Location/Type": "AssignedMoleculeLocationType",
            "GenBank-Accn": "GenBankAccn",
            "RefSeq-Accn": "RefSeqAccn",
            "UCSC-style-name": "UCSCstylename",
        }
    )
# Comment list
# Table of contents