# scrub_rescrape.py -- Python source code snippet

def check_review_counts(ratings_df):
    '''
    Function to check that enough data was collected. Compares the number of
    ratings recorded for each target employer with the number of reviews
    actually present in ratings_df, and queues under-collected employers for
    rescraping.

    The target employers are read from data/clean_employers.pkl. An employer
    is queued when more than half of its expected ratings are missing,
    including employers for which no reviews were collected at all. Queued
    employers are split on overall_rating (> 3.5 vs. < 3.5) and the two lists
    are pickled to data/rescrape_pros.pkl and data/rescrape_cons.pkl.

    INPUT:
        ratings_df: Pandas DataFrame containing scraped review text. Must
            have a 'company_name' column; every row is counted as one
            collected review for that company.

    OUTPUT:
        good_er_ids, bad_er_ids: Lists of (company_name, company_id) tuples
            to rescrape from glassdoor
    '''
    missing_pct_cutoff = 0.5   # queue when > 50% of ratings are missing
    rating_cutoff = 3.5        # boundary between the "pros" and "cons" lists

    clean_df = pd.read_pickle(os.path.join('data', 'clean_employers.pkl'))
    target_ratings = clean_df[['company_name', 'company_id',
                               'num_ratings', 'overall_rating']]

    # Name the key and count columns explicitly. The labels produced by
    # value_counts()/reset_index() by default differ between pandas versions,
    # so merging on an implicit 'index' column is fragile.
    company_ratings = (ratings_df['company_name']
                       .value_counts()
                       .rename_axis('company_name')
                       .reset_index(name='ratings_collected'))
    check_df = target_ratings.merge(company_ratings,
                                    how='left',
                                    on='company_name')
    check_df['company_id'] = check_df['company_id'].astype(int)

    # Employers with no rows in ratings_df come out of the left merge as NaN.
    # Count them as 0 collected so they are queued rather than silently
    # failing the NaN > cutoff comparison below.
    check_df['ratings_collected'] = check_df['ratings_collected'].fillna(0)

    check_df['delta'] = check_df['num_ratings'] - check_df['ratings_collected']
    check_df['delta_pct'] = check_df['delta'] / check_df['num_ratings']
    rescrape = check_df[check_df['delta_pct'] > missing_pct_cutoff]

    # NOTE: both comparisons are strict, so an employer rated exactly 3.5
    # lands in neither list.
    good_rescrape = rescrape[rescrape['overall_rating'] > rating_cutoff]
    bad_rescrape = rescrape[rescrape['overall_rating'] < rating_cutoff]

    # Materialise the pairs: on Python 3 zip() is a one-shot iterator, which
    # is neither the documented list nor safe to both pickle and return.
    good_er_ids = list(zip(good_rescrape['company_name'],
                           good_rescrape['company_id']))
    bad_er_ids = list(zip(bad_rescrape['company_name'],
                          bad_rescrape['company_id']))

    # Context managers make sure the pickle files are flushed and closed.
    with open(os.path.join('data', 'rescrape_pros.pkl'), 'wb') as pros_file:
        pickle.dump(good_er_ids, pros_file)
    with open(os.path.join('data', 'rescrape_cons.pkl'), 'wb') as cons_file:
        pickle.dump(bad_er_ids, cons_file)
    return good_er_ids, bad_er_ids