def run(argv):
    """Make sure the cleaned dbSNP file exists at ``clean_filepath``.

    If ``clean_filepath`` already exists, nothing is downloaded or converted.
    Otherwise:

    1. Unless ``raw_filepath`` already exists, the dbSNP VCF is downloaded
       with ``wget`` to a temporary path and then renamed to ``raw_filepath``.
    2. The raw file is decompressed, lines starting with ``#`` are dropped,
       the first five tab-separated columns of each line are kept, and the
       result is gzipped to a temporary path that is then renamed to
       ``clean_filepath``.

    The final location is printed in every case.

    Args:
        argv: Command-line arguments; not used by this function.
    """
    # Local import: only needed to quote the paths for the shell script below.
    import shlex

    if not os.path.exists(clean_filepath):
        print('dbsnp will be stored at {!r}'.format(clean_filepath))

        if not os.path.exists(raw_filepath):
            # dbSNP downloads are described at <https://www.ncbi.nlm.nih.gov/variation/docs/human_variation_vcf/>
            # This file includes chr-pos-ref-alt-rsid and 4X a bunch of useless columns:
            dbsnp_url = 'ftp://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606_b150_GRCh37p13/VCF/00-All.vcf.gz'

            print('Downloading dbsnp!')
            make_basedir(raw_filepath)
            # Download to a temporary path and rename afterwards, so that
            # `raw_filepath` only appears once the download call has returned.
            raw_tmp_filepath = get_tmp_path(raw_filepath)
            wget.download(url=dbsnp_url, out=raw_tmp_filepath)
            print('')  # presumably terminates wget's progress output line -- TODO confirm
            os.rename(raw_tmp_filepath, raw_filepath)
            print('Done downloading.')

        print('Converting {} -> {}'.format(raw_filepath, clean_filepath))
        make_basedir(clean_filepath)
        clean_tmp_filepath = get_tmp_path(clean_filepath)
        # The paths are quoted with shlex.quote() rather than wrapped in
        # hard-coded single quotes, so that a path containing an apostrophe
        # (or any other shell metacharacter) cannot break the pipeline.
        # str() keeps this working if the paths are os.PathLike objects.
        run_script(r'''
        gzip -cd {raw_filepath} |
        grep -v '^#' |
        perl -F'\t' -nale 'print "$F[0]\t$F[1]\t$F[2]\t$F[3]\t$F[4]"' | # Gotta declare that it's tab-delimited, else it's '\s+'-delimited I think.
        gzip > {clean_tmp_filepath}
        '''.format(
            raw_filepath=shlex.quote(str(raw_filepath)),
            clean_tmp_filepath=shlex.quote(str(clean_tmp_filepath)),
        ))
        # Same write-to-temp-then-rename scheme as for the raw download.
        os.rename(clean_tmp_filepath, clean_filepath)

    print("dbsnp is at '{clean_filepath}'".format(clean_filepath=clean_filepath))
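# Web-page residue picked up during extraction (not part of the program): "评论列表" (comment list)
# Web-page residue picked up during extraction (not part of the program): "文章目录" (table of contents)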