def check_review_counts(ratings_df):
    '''
    Function to check that enough data was collected. Compares the number of
    reviews expected for each target employer with the number of reviews
    actually present in ratings_df, and flags employers where more than half
    of the expected reviews are missing.

    Side effects: reads data/clean_employers.pkl and writes
    data/rescrape_pros.pkl and data/rescrape_cons.pkl (paths are relative to
    the current working directory).

    INPUT:
    ratings_df: Pandas DataFrame containing scraped review text; must have a
        'company_name' column with one row per collected review
    OUTPUT:
    good_er_ids, bad_er_ids: Lists of (company_name, company_id) tuples to
        rescrape from glassdoor. good_er_ids holds employers with
        overall_rating > 3.5, bad_er_ids those with overall_rating < 3.5.
    '''
    clean_df = pd.read_pickle(os.path.join('data', 'clean_employers.pkl'))
    target_ratings = clean_df[['company_name', 'company_id',
                               'num_ratings', 'overall_rating']]

    # Count collected reviews per employer. The index/column names are set
    # explicitly because the default names produced by value_counts() +
    # reset_index() differ between pandas versions.
    company_ratings = (ratings_df['company_name']
                       .value_counts()
                       .rename_axis('company_name')
                       .reset_index(name='ratings_collected'))

    check_df = target_ratings.merge(company_ratings,
                                    how='left',
                                    on='company_name')
    check_df['company_id'] = check_df['company_id'].astype(int)

    # Employers with no collected reviews come out of the left merge as NaN;
    # treat them as zero so they are flagged for rescraping instead of being
    # silently dropped by the NaN comparison below.
    check_df['ratings_collected'] = check_df['ratings_collected'].fillna(0)

    check_df['delta'] = check_df['num_ratings'] - check_df['ratings_collected']
    # num_ratings == 0 yields NaN/-inf here, which never exceeds the threshold.
    check_df['delta_pct'] = check_df['delta'] / check_df['num_ratings']

    rescrape = check_df[check_df['delta_pct'] > 0.5]
    # NOTE(review): a rating of exactly 3.5 lands in neither group; kept as in
    # the original - confirm whether that is intended.
    good_rescrape = rescrape[rescrape['overall_rating'] > 3.5]
    bad_rescrape = rescrape[rescrape['overall_rating'] < 3.5]

    # Materialise the pairs: zip() is a one-shot iterator on Python 3, and
    # the same data has to be both pickled and returned to the caller.
    good_er_ids = list(zip(good_rescrape['company_name'],
                           good_rescrape['company_id']))
    bad_er_ids = list(zip(bad_rescrape['company_name'],
                          bad_rescrape['company_id']))

    # Context managers make sure the pickle files are flushed and closed.
    with open(os.path.join('data', 'rescrape_pros.pkl'), 'wb') as pros_file:
        pickle.dump(good_er_ids, pros_file)
    with open(os.path.join('data', 'rescrape_cons.pkl'), 'wb') as cons_file:
        pickle.dump(bad_er_ids, cons_file)

    return good_er_ids, bad_er_ids
评论列表
文章目录