def pickle_correlations_zeros_january():
    """Write per-target-article click totals for the 201501 clickstream to a TSV.

    Steps:
      1. Load every (source_article_id, target_article_id) pair from
         ``link_features``.
      2. Load (prev_id, curr_id, counts) rows from
         ``clickstream_derived_en_201501`` whose ``link_type_derived`` is
         "internal-link".
      3. Left-merge the links with the counts on
         (source_article_id, target_article_id) == (prev_id, curr_id), so
         links without a clickstream row are kept with a missing count.
      4. Sum ``counts`` per ``target_article_id`` and replace missing sums
         with 0.0.
      5. Write the ``target_article_id`` and ``counts`` columns, tab
         separated and without the index, to
         ``TMP + 'january_article_counts.tsv'``.

    Despite its name, this function does not write a pickle; its only output
    is the TSV file plus progress messages on stdout. It returns None.
    """
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()
    try:
        # Single-argument print(...) produces the same output on Python 2
        # and Python 3.
        print('read')
        links = pd.read_sql('select source_article_id, target_article_id from link_features', conn)
        print('loaded links')
        clicks = pd.read_sql('select prev_id, curr_id, counts from clickstream_derived_en_201501 where link_type_derived= "internal-link";', conn)
        print('loaded counts')
    finally:
        # Release the connection as soon as both queries are done (or have
        # failed); the original never closed it.
        # NOTE(review): assumes _create_connection() returns a DB-API style
        # connection exposing close() -- confirm against MySQLDatabase.
        conn.close()

    # Left join keeps every link, including those with no clickstream row.
    result = pd.merge(
        links,
        clicks,
        how='left',
        left_on=['source_article_id', 'target_article_id'],
        right_on=['prev_id', 'curr_id'],
    )
    print('merged counts')
    print(result)

    article_counts = result.groupby(by=["target_article_id"])['counts'].sum().reset_index()
    # Assign the filled column back instead of calling fillna(inplace=True)
    # on a column selection: the in-place form is chained assignment and is
    # not guaranteed to modify article_counts on newer pandas versions.
    article_counts['counts'] = article_counts['counts'].fillna(0.0)
    print(article_counts)

    print('write to file')
    article_counts[["target_article_id", "counts"]].to_csv(
        TMP+'january_article_counts.tsv', sep='\t', index=False
    )
评论列表
文章目录