es_download.py 文件源码

python
阅读 17 收藏 0 点赞 0 评论 0

项目:scrapy-cdr 作者: TeamHG-Memex 项目源码 文件源码
def main():
    parser = argparse.ArgumentParser(description='Download items from ES index')
    arg = parser.add_argument
    arg('output', help='output in .jl.gz format')
    arg('index', help='ES index name')
    arg('--domain', help='url.domain to filter')
    arg('--id', help='record id')
    arg('--host', default='localhost', help='ES host in host[:port] format')
    arg('--user', help='HTTP Basic Auth user')
    arg('--password', help='HTTP Basic Auth password')
    arg('--chunk-size', type=int, default=100, help='download chunk size')

    args = parser.parse_args()
    kwargs = {}
    if args.user or args.password:
        kwargs['http_auth'] = (args.user, args.password)

    client = elasticsearch.Elasticsearch(
        [args.host],
        connection_class=elasticsearch.RequestsHttpConnection,
        timeout=600,
        **kwargs)
    print(client.info())

    search = Search(using=client, index=args.index)
    if args.domain:
        search = search.filter('term', **{'url.domain': args.domain})
    if args.id:
        search = search.filter('term', **{'_id': args.id})

    total = 0
    with tqdm.tqdm(total=search.count()) as pbar:
        with gzip.open(args.output, 'wt') as f:
            for x in search.params(size=args.chunk_size).scan():
                total += 1
                pbar.update(1)
                f.write(json.dumps(x.to_dict()))
                f.write('\n')

    print('{:,} items downloaded to {}'.format(total, args.output))
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号