scrub_rescrape.py 文件源码

python
阅读 27 收藏 0 点赞 0 评论 0

项目:glassdoor-analysis 作者: THEdavehogue 项目源码 文件源码
def check_review_counts(ratings_df):
    '''
    Function to check that enough data was collected. Compares the number of
    reviews listed for each target employer with the number of reviews
    actually collected, and selects employers missing more than half of
    their reviews for rescraping.

    Reads the target employers from data/clean_employers.pkl and writes the
    two resulting lists to data/rescrape_pros.pkl and data/rescrape_cons.pkl.

    INPUT:
        ratings_df: Pandas DataFrame containing scraped review text. Must
            have a 'company_name' column; each row is counted as one
            collected review for that company.

    OUTPUT:
        good_er_ids, bad_er_ids: Lists of (company_name, company_id) tuples
            to rescrape from glassdoor. good_er_ids holds employers with
            overall_rating > 3.5, bad_er_ids those with overall_rating < 3.5.
    '''
    clean_df = pd.read_pickle(os.path.join('data', 'clean_employers.pkl'))
    target_ratings = clean_df[['company_name', 'company_id',
                               'num_ratings', 'overall_rating']]

    # Name the columns explicitly instead of relying on the default column
    # names produced by value_counts()/reset_index(), which differ between
    # pandas versions.
    company_ratings = (ratings_df['company_name']
                       .value_counts()
                       .rename_axis('company_name')
                       .reset_index(name='ratings_collected'))
    check_df = target_ratings.merge(company_ratings,
                                    how='left',
                                    on='company_name')
    check_df['company_id'] = check_df['company_id'].astype(int)

    # Employers with no collected reviews come out of the left merge as NaN.
    # Count them as 0 so they are flagged for rescrape rather than silently
    # dropped by the NaN comparison below.
    check_df['ratings_collected'] = check_df['ratings_collected'].fillna(0)

    check_df['delta'] = check_df['num_ratings'] - check_df['ratings_collected']
    check_df['delta_pct'] = check_df['delta'] / check_df['num_ratings']
    rescrape = check_df[check_df['delta_pct'] > 0.5]

    # NOTE(review): a rating of exactly 3.5 lands in neither group -- confirm
    # that this is intended.
    good_rescrape = rescrape[rescrape['overall_rating'] > 3.5]
    bad_rescrape = rescrape[rescrape['overall_rating'] < 3.5]

    # Materialize the pairs: on Python 3 zip() is a one-shot iterator that
    # cannot be pickled and would be exhausted after a single pass.
    good_er_ids = list(zip(good_rescrape['company_name'],
                           good_rescrape['company_id']))
    bad_er_ids = list(zip(bad_rescrape['company_name'],
                          bad_rescrape['company_id']))

    # Context managers make sure the pickle files are flushed and closed.
    with open(os.path.join('data', 'rescrape_pros.pkl'), 'wb') as pros_file:
        pickle.dump(good_er_ids, pros_file)
    with open(os.path.join('data', 'rescrape_cons.pkl'), 'wb') as cons_file:
        pickle.dump(bad_er_ids, cons_file)
    return good_er_ids, bad_er_ids
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号