def pickle_vis_data_pandas(output_dir='/home/ddimitrov/tmp'):
    """Export de-duplicated link pairs per visual region as TSV files.

    Loads the link rows of the ``link_features`` table, sorts them by source
    article, then y coordinate, then x coordinate, and collapses them to one
    row per (source_article_id, target_article_id) pair with
    ``groupby(...).first()``. For each exported visual region the matching
    pairs are written to ``<output_dir>/<region>.tsv`` (tab separated, with a
    header row and without the index).

    The row counts of the raw table, the de-duplicated table and every
    visual region are printed, in that order, as progress output.

    Args:
        output_dir: Directory that receives the TSV files. The default is
            the directory this function has always written to.
    """
    # Function-scope import: the module's import block is not visible here.
    import os

    pair_columns = ['source_article_id', 'target_article_id']
    sort_keys = ['source_article_id',
                 'target_y_coord_1920_1080',
                 'target_x_coord_1920_1080']
    query = ('select source_article_id, target_article_id, '
             'target_y_coord_1920_1080, target_x_coord_1920_1080, '
             'visual_region from link_features')

    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()
    try:
        df = pd.read_sql(query, conn)
    finally:
        # read_sql has returned the whole frame (or failed), so the
        # connection is released here instead of being leaked.
        conn.close()
    print(len(df))

    # DataFrame.sort was removed in pandas 0.20. Prefer sort_values and only
    # fall back to the legacy method on releases that predate it.
    sort_rows = getattr(df, 'sort_values', None) or df.sort
    no_dup = sort_rows(sort_keys).groupby(pair_columns).first()
    print(len(no_dup))

    # (region, export?) in the original processing order. 'lead' has only
    # ever been counted, never written to disk.
    # NOTE(review): confirm whether a lead.tsv export was intended.
    regions = [
        ('lead', False),
        ('infobox', True),
        ('navbox', True),
        ('left-body', True),
        ('body', True),
    ]
    for region, export in regions:
        feature = no_dup.loc[no_dup['visual_region'] == region]
        print(len(feature))
        if not export:
            continue
        # The pair ids live in the group index; reset_index() returns a new
        # frame with them as columns instead of mutating a slice of no_dup.
        feature = feature.reset_index()
        feature[pair_columns].to_csv(
            os.path.join(output_dir, region + '.tsv'), sep='\t', index=False)
评论列表
文章目录