response_stats.py 文件源码

python
阅读 22 收藏 0 点赞 0 评论 0

项目:domain-discovery-crawler 作者: TeamHG-Memex 项目源码 文件源码
def print_scores(response_logs: List[pd.DataFrame], opts):
    """Print summary relevancy statistics and plot the score over time.

    Concatenates the response logs, prints the page count, the number of
    pages with a score above 0.5 and the average scores, delegates
    per-domain statistics to ``show_domain_stats``, and then builds a time
    series of the (optionally smoothed) ``score`` column averaged over
    ``opts.step``-second buckets.

    Args:
        response_logs: Data frames that each provide at least a ``score``
            column and a ``time`` column; ``time`` is parsed as a number of
            seconds since the epoch.
        opts: Options object. The attributes read here are ``output``,
            ``top``, ``smooth``, ``step`` and ``no_show``.
    """
    joined = pd.concat(response_logs)  # type: pd.DataFrame
    # A page counts as relevant when its score is strictly above 0.5.
    binary_score = joined['score'] > 0.5
    print()
    print('Total number of pages: {:,}, relevant pages: {:,}, '
          'average binary score: {:.2f}, average score: {:.2f}'.format(
            len(joined), binary_score.sum(), binary_score.mean(),
            joined['score'].mean()))
    # Hand over a copy so that whatever the helper does to the frame cannot
    # affect the time-indexed processing below.
    show_domain_stats(joined.copy(), output=opts.output, top=opts.top)
    joined.sort_values(by='time', inplace=True)
    joined.index = pd.to_datetime(joined.pop('time'), unit='s')
    # Smoothing needs at least two rows to measure a crawl duration.
    if opts.smooth and len(joined) > 1:
        crawl_time = (joined.index[-1] - joined.index[0]).total_seconds()
        # All rows sharing one timestamp gives a zero duration; dividing by
        # it would raise ZeroDivisionError, so smoothing is skipped instead.
        if crawl_time > 0:
            avg_rps = len(joined) / crawl_time
            # The span is expressed in rows: ``smooth`` steps of ``step``
            # seconds at the average response rate. pandas rejects spans
            # below 1, so clamp (span=1 leaves the series unchanged).
            span = max(1, int(opts.smooth * opts.step * avg_rps))
            joined['score'] = joined['score'].ewm(span=span).mean()
    print_averages({'score': joined['score']}, opts.step, '{:.2f}')
    title = 'Page relevancy score'
    scores = joined['score'].resample('{}S'.format(opts.step)).mean()
    plot = TimeSeries(scores, plot_width=1000,
                      xlabel='time', ylabel='score', title=title)
    plot.set(y_range=Range1d(0, 1))
    if not opts.no_show:
        save_plot(plot, title=title, suffix='score', output=opts.output)
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号