def __get_remote_index(self):
    """
    Get the index of news crawl files from commoncrawl.org.

    Runs ``aws s3 ls --recursive`` (unsigned request) against the CC-NEWS
    prefix of the ``commoncrawl`` bucket and extracts the fourth
    whitespace-separated column of every listing row, which is expected to
    be the object key.

    The listing is read directly from the process' stdout; no temporary file
    is written and no shell is involved. This method does not raise on a
    failed listing: the failure is logged and an empty list is returned.

    :return: list of names (one per listed file); empty list when the ``aws``
        executable cannot be started or exits with a non-zero status
    """
    cmd = [
        "aws", "s3", "ls", "--recursive",
        "s3://commoncrawl/crawl-data/CC-NEWS/",
        "--no-sign-request",
    ]
    self.__logger.info('executing: %s', " ".join(cmd))

    try:
        # stdout and stderr are captured separately so that error messages
        # can never be mistaken for file names.
        completed = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
        )
    except OSError as err:
        # e.g. the aws CLI is not installed or not executable
        self.__logger.error('could not execute %s: %s', cmd[0], err)
        return []

    if completed.returncode != 0:
        self.__logger.error(
            'remote index listing failed (exit code %s): %s',
            completed.returncode, completed.stderr.strip())
        return []

    lines = []
    for row in completed.stdout.splitlines():
        # Split into at most four fields so that the last one keeps the
        # complete remainder of the row, even if it contains spaces.
        fields = row.split(None, 3)
        if len(fields) == 4:
            lines.append(fields[3].rstrip())
        # rows with fewer than four columns carry no name and are skipped

    self.__logger.debug('remote index contains %s entries', len(lines))
    return lines
# [web page navigation residue] Comment list
# [web page navigation residue] Table of contents