def show_domain_stats(log, output, top=50):
    """Print per-domain statistics for a crawl log and optionally save them.

    Rows of ``log`` are grouped by the value ``get_domain`` returns for
    their ``url``. For each domain the page count, total and mean ``score``,
    and maximum and median ``depth`` are computed. Domains are ordered by
    page count, largest first.

    Args:
        log: pandas DataFrame with ``url``, ``score`` and ``depth`` columns.
            A ``Domain`` column is added to it in place.
        output: Prefix of the CSV file name (``<output>-by-domain.csv``).
            When falsy, nothing is written.
        top: How many of the leading domains to print. The CSV file always
            contains every domain.

    Side effects:
        Prints to stdout and sets the global pandas option
        ``display.width`` to 1000 (it is not restored afterwards).
    """
    log['Domain'] = log['url'].apply(get_domain)
    by_domain = log.groupby('Domain')

    # Aggregate one column at a time. Whole-frame reductions such as
    # ``by_domain.mean()`` raise TypeError on non-numeric columns (e.g.
    # ``url``) in pandas >= 2.0, and reduce every column only to keep one.
    pages_per_domain = by_domain['url'].count().sort_values(ascending=False)
    score_by_domain = by_domain['score']
    depth_by_domain = by_domain['depth']

    # The frame is created with the sorted index first so that every column
    # assigned below is aligned to the "most pages first" ordering.
    stats_by_domain = pd.DataFrame(index=pages_per_domain.index)
    stats_by_domain['Pages'] = pages_per_domain
    stats_by_domain['Total Score'] = score_by_domain.sum().astype(int)
    stats_by_domain['Mean Score'] = score_by_domain.mean()
    stats_by_domain['Max Depth'] = depth_by_domain.max()
    stats_by_domain['Median Depth'] = depth_by_domain.median().astype(int)

    print()
    pages = stats_by_domain['Pages']
    total_pages = pages.sum()
    # Guard the ratio so an empty log reports 0.0% instead of dividing 0 by 0.
    if total_pages:
        coverage = pages.iloc[:top].sum() / total_pages
    else:
        coverage = 0.0
    print('Top {} domains stats (covering {:.1%} pages)'
          .format(top, coverage))
    pd.set_option('display.width', 1000)
    print(stats_by_domain.iloc[:top])

    if output:
        filename = '{}-by-domain.csv'.format(output)
        stats_by_domain.to_csv(filename)
        print()
        print('Saved domain stats to {}'.format(filename))
response_stats.py 文件源码
python
阅读 37
收藏 0
点赞 0
评论 0
评论列表
文章目录