def read_data_table_test(test_tables):
    test = pd.read_csv('../input/sample_submission.csv')[['image_name']]
    i = 0
    for t_path in test_tables:
        data = pd.read_csv(t_path)
        data.rename(columns={'Type_1': 'Type_1_num_{}'.format(i),
                             'Type_2': 'Type_2_num_{}'.format(i),
                             'Type_3': 'Type_3_num_{}'.format(i),
                             }, inplace=True)
        # Left join keeps every submission image; 'on' alone expresses the intended join
        # (combining it with left_index raises a MergeError in recent pandas).
        test = pd.merge(test, data, how='left', on='image_name')
        i += 1
    '''
    resolutions = pd.read_csv("../modified_data/resolutions_and_color_features_1.csv")
    resolutions = resolutions[resolutions['type'] == 'test']
    resolutions.drop(['type'], axis=1, inplace=True)
    test = pd.merge(test, resolutions, how='left', on='image_name', left_index=True)
    '''
    return test
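The function above builds the test table by repeatedly left-joining each blend table onto the sample-submission skeleton. For readers skimming these pd.merge() examples, a minimal sketch of that left-join pattern with made-up frames (not part of the original project):

import pandas as pd

skeleton = pd.DataFrame({'image_name': ['a.jpg', 'b.jpg', 'c.jpg']})
preds = pd.DataFrame({'image_name': ['a.jpg', 'c.jpg'],
                      'Type_1_num_0': [0.2, 0.5]})

# how='left' keeps every skeleton row; images without a prediction get NaN.
merged = pd.merge(skeleton, preds, how='left', on='image_name')
print(merged)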
Python merge() example source code
Source file: a40_run_xgboost_blender.py (project: KAGGLE_CERVICAL_CANCER_2017, author: ZFTurbo)
def read_data(file_01, file_02):
    data_01 = pd.read_csv(
        file_01,
        parse_dates={'timeline': ['btce-time_stamp']},
        infer_datetime_format=True)
    data_02 = pd.read_csv(
        file_02,
        parse_dates={'timeline': ['epoch_time_stamp']},
        infer_datetime_format=True)
    data_02 = data_02.drop_duplicates('epoch')
    data_01['timeline'] = data_01['timeline'].astype(float)
    data_02['timeline'] = data_02['timeline'].astype(float)
    # Align data_02 to data_01's timestamps by nearest-neighbour reindexing, then merge.
    data_ = data_02.set_index('timeline').reindex(data_01.set_index('timeline').index, method='nearest').reset_index()
    data = pd.merge(data_01, data_, on='timeline', suffixes=('_', ''))
    return data
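The reindex(..., method='nearest') step above aligns the two series on the closest timestamps before merging. The same nearest-timestamp join can also be written with pd.merge_asof; a minimal sketch on hypothetical data rather than the original files:

import pandas as pd

data_01 = pd.DataFrame({'timeline': [1.0, 2.5, 4.0], 'price': [10, 11, 12]})
data_02 = pd.DataFrame({'timeline': [0.9, 2.6, 3.8], 'volume': [100, 120, 90]})

# Both frames must be sorted on the key; direction='nearest' matches each
# left row with the closest right-hand timestamp.
aligned = pd.merge_asof(data_01.sort_values('timeline'),
                        data_02.sort_values('timeline'),
                        on='timeline', direction='nearest')
print(aligned)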
def digPHconveseRateV2(data_type='train'):
    df_output = ''
    save_path = ''
    if data_type == 'train':
        df_output = pd.read_csv('data/cutData/train_time_v7.csv')
        save_path = 'data/cutData/train_time_v8.csv'
    elif data_type == 'test':
        df_output = pd.read_csv('data/first_merge/test_join_v7.csv')
        save_path = 'data/first_merge/test_join_v8.csv'
    else:
        print('unknown data_type')
        return
    # Attach the per-user PL features with a left join on userID.
    df_userPH = pd.read_csv('data/feature/PL_user.csv')
    df_output = pd.merge(df_output, df_userPH, how='left', on='userID')
    del df_userPH
    print('saving......')
    df_output.to_csv(save_path, index=False)
def predict_test_prob(bst):
    df_all = loadCSV('data/first_merge/test_join_v9.csv')
    df_sta_lgbm = loadCSV('data/stacking/prob_lgbm_test.csv')
    print('merging stacking features......')
    # Attach the LightGBM stacking probabilities to the test features.
    df_all = pd.merge(df_all, df_sta_lgbm, how='left', on='instanceID')
    del df_sta_lgbm
    instanceID = df_all.instanceID.values
    feature_all = df_all.drop(['label', 'clickTime', 'instanceID',
                               'residence', 'appCategory'], axis=1).values
    del df_all
    dtest = xgb.DMatrix(feature_all)
    prob = bst.predict(dtest)
    output = pd.DataFrame({'instanceID': instanceID, 'prob': prob})
    output.to_csv('result/submission2.csv', index=False)

# LightGBM counterpart of the XGBoost prediction above
def predict_test_prob(lgbm):
    df_all = loadCSV('data/first_merge/test_join_v9.csv')
    df_sta_xgb = loadCSV('data/stacking/prob_xgb_test.csv')
    print('merging stacking features......')
    # Attach the XGBoost stacking probabilities to the test features.
    df_all = pd.merge(df_all, df_sta_xgb, how='left', on='instanceID')
    del df_sta_xgb
    instanceID = df_all.instanceID.values
    feature_all = df_all.drop(['label', 'clickTime', 'instanceID',
                               'residence', 'appCategory'], axis=1).values
    prob = lgbm.predict(feature_all, num_iteration=lgbm.best_iteration)
    output = pd.DataFrame({'instanceID': instanceID, 'prob': prob})
    output.to_csv('result/submission.csv', index=False)

# Split user_installedapps into chunks and join each with the app categories
def cut_install():
    df_installed = pd.read_csv('data/origin/user_installedapps.csv')
    df_app_cat = pd.read_csv('data/origin/app_categories.csv')
    print('splitting installed apps......')
    # Process the table in 10 slices to keep the merge's memory footprint small.
    total = len(df_installed)
    p_len = total // 10
    for i in range(9):
        print('processing part %d' % (i + 1))
        df_part = df_installed[i * p_len:(i + 1) * p_len]
        df_part = pd.merge(df_part, df_app_cat, how='left', on='appID')
        p_name = 'data/feature/install_cut/cut_p' + str(i + 1) + '.csv'
        df_part.to_csv(p_name, index=False)
        del df_part
    print('processing last part')
    df_part = df_installed[9 * p_len:]
    df_part = pd.merge(df_part, df_app_cat, how='left', on='appID')
    df_part.to_csv('data/feature/install_cut/cut_p10.csv', index=False)
    del df_part
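If memory is the only reason for the manual slicing above, the same chunk-and-merge can be driven by read_csv's chunksize parameter; a sketch under the same file layout, with the chunk size picked arbitrarily:

import pandas as pd

df_app_cat = pd.read_csv('data/origin/app_categories.csv')

# Stream the large table in 1,000,000-row chunks and left-join each chunk on appID.
reader = pd.read_csv('data/origin/user_installedapps.csv', chunksize=1000000)
for i, chunk in enumerate(reader, start=1):
    part = pd.merge(chunk, df_app_cat, how='left', on='appID')
    part.to_csv('data/feature/install_cut/cut_p{}.csv'.format(i), index=False)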
def iter_over_stocks(stock_lists, db):
    final_result = {}
    for x in stock_lists:
        for y in stock_lists:
            data1 = db.select_data(stock_lists[0], begin='2010-01-01', end='2015-12-30')
            data_frame = db.get_dataframe('sh600741', begin='2010-01-01', end='2015-12-30')
            data_frame = data_frame.set_index(data_frame.date)
            data_frame1 = db.get_dataframe('sh601668', begin='2010-01-01', end='2015-12-30')
            data_frame1 = data_frame1.set_index(data_frame1.date.values)
            base_data1 = raise_value(data_frame)
            base_data2 = raise_value(data_frame1)
            # Inner join on the date index keeps only the trading days both series share.
            result = pd.merge(base_data1, base_data2, left_index=True, right_index=True, how='inner')
            result = window_similarity(result)
            final_result.update({x + y: result})
    return final_result
def summary(self):
    """
    This function is used to summarize the result.
    If you want to calculate other indicators, you can add them here.
    :return:
    """
    if self._analysis is not None:
        self._analysis(self.asset_dict)
    # for x in self.asset_dict:
    #     self.get_benchmark()
    #     asset_return = (self.asset_dict[x] - self._base_fund) / self._base_fund
    #     asset_return = asset_return.add_prefix(str(x) + "_")
    #     print asset_return
    #     result = pd.merge(asset_return, self._benchmark_data,
    #                       left_index=True, right_index=True, how="inner")
    #     max_return = self.get_max_return(x, begin=self._begin_date, end=self._end_date)
    #     print max_return
    #     # print result
    #     # if self._analysis is not None:
    #     #     self._analysis(result)
    #     #     result.plot()
    #     #     plt.show()
def eval_f12(pred, real):
    '''
    param:
        pred --> dataframe of predicted (user_id, sku_id) pairs
        real --> dataframe of true (user_id, sku_id) pairs
    '''
    real['label'] = 1
    # Left-joining the truth onto the predictions marks each predicted pair as hit (1) or miss (0).
    pred = pd.merge(pred, real, on=['user_id', 'sku_id'], how='left')
    pred.fillna(0, inplace=True)
    p = pred.label.mean()
    r = np.sum(pred.label) / real.shape[0]
    f12 = (5 * p * r) / (2 * r + 3 * p)
    real.drop(['label'], axis=1, inplace=True)
    print('<--------------- F12 evaluation --------------->')
    print('precision --->: {0}'.format(p))
    print('recall --->: {0}'.format(r))
    print('F12 score -->: {0}'.format(f12))
    return f12
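A toy run of the merge trick above, with invented user/sku pairs, shows how the left join plus fillna(0) turns the label column into a hit indicator:

import numpy as np
import pandas as pd

pred = pd.DataFrame({'user_id': [1, 1, 2], 'sku_id': [10, 11, 20]})
real = pd.DataFrame({'user_id': [1, 2], 'sku_id': [10, 21]})
real['label'] = 1

hits = pd.merge(pred, real, on=['user_id', 'sku_id'], how='left').fillna(0)
p = hits.label.mean()                    # 1 of 3 predictions is a true pair -> precision 1/3
r = np.sum(hits.label) / real.shape[0]   # 1 of 2 true pairs was predicted  -> recall 1/2
f12 = (5 * p * r) / (2 * r + 3 * p)      # ~0.417 for this toy case
print(p, r, f12)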
def load_UCPair_onlyact(start_date='2016-02-01 00:00:00', end_date='2016-04-16 00:00:00', cate=[8]):
    '''
    For each user who acted on cate 8, flag whether cate 8 is the only category they acted on.
    '''
    df = get_action_data(start_date=start_date, end_date=end_date, field=['user_id', 'cate'])
    df = df.drop_duplicates()
    # Count the distinct categories each user touched, then merge the count back onto the rows.
    temp = df.groupby(['user_id']).size().reset_index(name='ncate')
    df = pd.merge(df, temp, on=['user_id'], how='left')
    df = df[df.cate == 8]  # note: cate 8 is hard-coded; the `cate` argument is unused
    df['ncate'] = (df['ncate'] == 1).astype(int)
    return df[['user_id', 'cate', 'ncate']]
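The groupby/size/merge-back pattern above can usually be collapsed into a single groupby transform, avoiding the intermediate frame and the merge; a minimal sketch assuming the same user_id/cate layout:

import pandas as pd

df = pd.DataFrame({'user_id': [1, 1, 2, 3],
                   'cate': [8, 5, 8, 8]}).drop_duplicates()

# transform('size') broadcasts each user's row count back onto every row.
df['ncate'] = df.groupby('user_id')['cate'].transform('size')
only_cate8 = df[df.cate == 8].assign(ncate=lambda d: (d.ncate == 1).astype(int))
print(only_cate8)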
# def get_uid_label(start_date='2016-02-01 00:00:00', end_date='2016-04-15 00:00:00'):
#     dump_path = './cache/uid_label_{0}_{1}.pkl'.format(start_date[:10], end_date[:10])
#     if os.path.exists(dump_path):
#         with open(dump_path, 'rb') as f:
#             df = pickle.load(f)
#     else:
#         df = get_action_data(start_date=start_date, end_date=end_date, field=['user_id', 'type'])
#         df = df[df.type==4].user_id.drop_duplicates().to_frame()
#         with open(dump_path, 'wb') as f:
#             pickle.dump(df, f)
#     return df
def load_brand_comment_ratio(end_date='2016-04-01 00:00:00'):
    '''
    Average bad-comment rate per brand, cached to disk.
    '''
    dump_path = './cache/brand_comment_ratio_{0}.pkl'.format(end_date[:10])
    if os.path.exists(dump_path):
        with open(dump_path, 'rb') as f:
            df = pickle.load(f)
    else:
        item_feat = load_base_item_feat(end_date=end_date)
        item_feat = item_feat[['sku_id', 'bad_comment_rate']]
        brands = get_action_data(start_date='2016-02-01 00:00:00', end_date=end_date, field=['sku_id', 'brand'])
        brands = brands.drop_duplicates()
        # Map each SKU to its brand, then average the bad-comment rate within each brand.
        df = pd.merge(item_feat, brands, on=['sku_id'], how='left')
        df = df[['brand', 'bad_comment_rate']]
        df = df.groupby(['brand'], as_index=False).mean()
        df.columns = ['brand', 'brand_bad_comment_rate']
        with open(dump_path, 'wb') as f:
            pickle.dump(df, f)
    return df
def fetch_feature_1(train_feature_path, finnal_feature_data_path):
    import pandas as pd
    train_feature_data = pd.read_csv(train_feature_path)
    print u'preparing user and comment data....'
    deal_with_user_data()
    deal_with_comment_data()
    # user_info = delete_user_info_no_1()
    user_info = pd.read_csv(user_data_final_path)
    comment_info = pd.read_csv(comment_data_final_path)
    print u'merging....'
    merge_feature_and_user_info = pd.merge(train_feature_data, user_info, on='user_id')
    merge_feature_and_user_info_and_comment = pd.merge(merge_feature_and_user_info, comment_info, on='sku_id')
    merge_feature_and_user_info_and_comment.to_csv(finnal_feature_data_path, index=False)
    print u'final feature file written to:', finnal_feature_data_path

# generate_feature_1.fetch_feature_1(train_one_train_feature_path_pre1,
#                                    train_one_train_feature_path_pre2)
def test():
    # Combine the feature tables built over different look-back windows into a single table.
    before_2_days_feature = pd.read_csv(one_before_2_days_feature_path)
    before_4_days_feature = pd.read_csv(one_before_4_days_feature_path)
    before_6_days_feature = pd.read_csv(one_before_6_days_feature_path)
    before_8_days_feature = pd.read_csv(one_before_8_days_feature_path)
    # Inspect the shapes.
    print u'2-days-before feature shape:', before_2_days_feature.shape
    print u'4-days-before feature shape:', before_4_days_feature.shape
    print u'6-days-before feature shape:', before_6_days_feature.shape
    print u'8-days-before feature shape:', before_8_days_feature.shape
    # Outer-join the tables pairwise on (user_id, sku_id), then combine the two results.
    new_data_df1 = pd.merge(before_2_days_feature, before_4_days_feature, on=['user_id', 'sku_id'], how='outer')
    new_data_df2 = pd.merge(before_6_days_feature, before_8_days_feature, on=['user_id', 'sku_id'], how='outer')
    new_data_df = pd.merge(new_data_df1, new_data_df2, on=['user_id', 'sku_id'], how='outer')
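A chain of pairwise outer joins like the one above can also be written as a single reduce over a list of frames; a minimal sketch with made-up feature columns:

from functools import reduce
import pandas as pd

frames = [
    pd.DataFrame({'user_id': [1, 2], 'sku_id': [10, 20], 'f2': [0.1, 0.2]}),
    pd.DataFrame({'user_id': [2, 3], 'sku_id': [20, 30], 'f4': [1.0, 2.0]}),
    pd.DataFrame({'user_id': [1, 3], 'sku_id': [10, 30], 'f6': [5, 6]}),
]

# Outer-join every frame on the (user_id, sku_id) key; pairs missing from a
# frame show up as NaN in that frame's columns.
merged = reduce(lambda l, r: pd.merge(l, r, on=['user_id', 'sku_id'], how='outer'), frames)
print(merged)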
def calculate_query_bin_bits(tfidf):  # this also needs to return the table from redis as well as the bin id
    table = str2int(ujson.loads(r.get('table')))
    dim = int(r.get('dim'))
    mapping = ujson.loads(r.get('map'))
    mapping = pd.DataFrame({'word': mapping})
    num_vectors = 16
    words = list(tfidf.keys())
    values = list(tfidf.values())
    tfidf_df = pd.DataFrame({'word': words, 'value': values})
    # Left-join the query's tf-idf weights onto the vocabulary order used for hashing;
    # vocabulary words absent from the query get weight 0.
    article_representation = pd.merge(mapping, tfidf_df, on='word', how='left').fillna(0)['value']
    bin_vectors = generate_random_vectors(num_vectors, dim)
    powers_of_two = 1 << np.arange(num_vectors - 1, -1, -1)
    query_bin_bits = (article_representation.dot(bin_vectors) >= 0)
    return query_bin_bits, table
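generate_random_vectors is not shown in this snippet; a plausible sketch, assuming it draws the random hyperplanes for sign-based LSH as a (dim, num_vectors) Gaussian matrix:

import numpy as np

def generate_random_vectors(num_vectors, dim, seed=0):
    # One Gaussian column per hash bit; the sign of the projection gives the bit.
    rng = np.random.RandomState(seed)
    return rng.randn(dim, num_vectors)

# The boolean bits can then be packed into a bin id with the powers_of_two vector:
# bin_id = query_bin_bits.dot(powers_of_two)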
Source file: ensemble.py (project: 5th_place_solution_facebook_check_ins, author: aikinogard)
def folderToDict(model_output_path, c, probas=None):
    logging.info("merge folder %s" % model_output_path)
    files = glob.glob(os.path.join(model_output_path, "*.csv"))
    if probas is None:
        probas = defaultdict(lambda: defaultdict(float))
    for f in files:
        logging.info("loading... %s" % f)
        df = pd.read_csv(f, dtype={"row_id": int, "place_id": int, "proba": float})
        for i in range(len(df)):
            probas[df["row_id"][i]][df["place_id"][i]] += c * df["proba"][i]
    try:
        with open(os.path.join(model_output_path, "map3.txt"), "r") as f_score:
            logging.info("map3=%6.6f" % float(f_score.read()))
    except:
        pass
    return probas
Source file: ensemble.py (project: 5th_place_solution_facebook_check_ins, author: aikinogard)
def parseDict(probas, output_name, valid_file=None):
    df = pd.DataFrame()
    df["row_id"] = probas.keys()
    df["place_id"] = df["row_id"].apply(lambda x: map(itemgetter(0),
                                        sorted(probas[x].items(), key=itemgetter(1), reverse=True)[:3]))
    if valid_file is not None:
        df_valid = pd.read_csv(valid_file, usecols=["row_id", "place_id"])
        df_valid.rename(columns={"place_id": "place_id_label"}, inplace=True)
        df_merge = pd.merge(df, df_valid, how="left", on="row_id")
        valid_score = metrics.mapk(df_merge.place_id_label.values[:, None],
                                   df_merge.place_id.values, 3)
        logging.info("total validation score: %f" % valid_score)
        np.savetxt("%s.txt" % output_name, [valid_score], fmt="%f")
        del df_valid
        del df_merge
    df["place_id"] = df["place_id"].apply(lambda x: " ".join(map(str, x)))
    df.to_csv("%s.csv" % output_name, index=False)
Source file: ensemble_add.py (project: 5th_place_solution_facebook_check_ins, author: aikinogard)
def folderToDict(model_output_path, c=1., probas=None):
    logging.info("merge folder %s" % model_output_path)
    files = glob.glob(os.path.join(model_output_path, "*.csv"))
    if probas is None:
        probas = defaultdict(lambda: defaultdict(float))
    for f in files:
        logging.info("loading... %s" % f)
        df = pd.read_csv(f, dtype={"row_id": int, "place_id": int, "proba": float})
        for i in range(len(df)):
            probas[df["row_id"][i]][df["place_id"][i]] += c * df["proba"][i]
    try:
        with open(os.path.join(model_output_path, "map3.txt"), "r") as f_score:
            logging.info("map3=%6.6f" % float(f_score.read()))
    except:
        pass
    return probas
def main():
    opts = get_parser().parse_args()
    rater_1 = pd.read_csv(opts.rater_1)[['participant_id', 'check-1']]
    rater_2 = pd.read_csv(opts.rater_2)[['participant_id', 'check-1']]
    rater_1.columns = ['participant_id', 'rater_1']
    rater_2.columns = ['participant_id', 'rater_2']
    # Outer join keeps participants rated by only one of the two raters.
    merged = pd.merge(rater_1, rater_2, on='participant_id', how='outer')
    idcol = 'participant_id'
    if opts.mapping_file:
        idcol = 'subject_id'
        name_mapping = pd.read_csv(
            opts.mapping_file, sep=' ', header=None, usecols=[0, 1])
        name_mapping.columns = ['subject_id', 'participant_id']
        name_mapping['participant_id'] = name_mapping.participant_id.astype(str) + '.gif'
        merged = pd.merge(name_mapping, merged, on='participant_id', how='outer')
    merged[[idcol, 'rater_1', 'rater_2']].sort_values(by=idcol).to_csv(opts.output, index=False)
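When debugging an outer join like the ones above, pandas' indicator flag shows which input contributed each row; a small sketch with invented participant ids:

import pandas as pd

rater_1 = pd.DataFrame({'participant_id': ['sub-01', 'sub-02'], 'rater_1': [1, 0]})
rater_2 = pd.DataFrame({'participant_id': ['sub-02', 'sub-03'], 'rater_2': [1, 1]})

# indicator=True adds a _merge column valued 'left_only', 'right_only' or 'both'.
merged = pd.merge(rater_1, rater_2, on='participant_id', how='outer', indicator=True)
print(merged)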
Source file: consensus.py (project: Comparative-Annotation-Toolkit, author: ComparativeGenomicsToolkit)
def calculate_improvement_metrics(final_consensus, scored_df, tm_eval_df, hgm_df, metrics):
    """For coding transcripts, how much did we improve the metrics?"""
    tm_df = tm_eval_df.reset_index()[['TransMapOriginalIntronsPercent', 'TranscriptId']]
    hgm_df_subset = hgm_df[hgm_df['AlignmentId'].apply(tools.nameConversions.aln_id_is_transmap)]
    hgm_df_subset = hgm_df_subset[['TranscriptId', 'IntronAnnotSupportPercent', 'IntronRnaSupportPercent']]
    tm_df = pd.merge(tm_df, hgm_df_subset, on='TranscriptId')
    df = pd.merge(tm_df, scored_df.reset_index(), on='TranscriptId', suffixes=['TransMap', ''])
    df = df.drop_duplicates(subset='AlignmentId')  # why do I need to do this?
    df = df.set_index('AlignmentId')
    metrics['Evaluation Improvement'] = {'changes': [], 'unchanged': 0}
    for aln_id, c in final_consensus:
        if c['transcript_biotype'] != 'protein_coding':
            continue
        if 'transMap' in c['transcript_modes']:
            metrics['Evaluation Improvement']['unchanged'] += 1
            continue
        tx_s = df.ix[aln_id]
        metrics['Evaluation Improvement']['changes'].append([tx_s.TransMapOriginalIntronsPercent,
                                                             tx_s.IntronAnnotSupportPercentTransMap,
                                                             tx_s.IntronRnaSupportPercentTransMap,
                                                             tx_s.OriginalIntronsPercent_mRNA,
                                                             tx_s.IntronAnnotSupportPercent,
                                                             tx_s.IntronRnaSupportPercent,
                                                             tx_s.TransMapGoodness,
                                                             tx_s.AlnGoodness_mRNA])
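The suffixes argument in the merge above is what yields paired columns such as IntronAnnotSupportPercentTransMap next to IntronAnnotSupportPercent; a minimal illustration with toy frames:

import pandas as pd

left = pd.DataFrame({'TranscriptId': ['tx1'], 'Score': [80.0]})
right = pd.DataFrame({'TranscriptId': ['tx1'], 'Score': [95.0]})

# Overlapping non-key columns get the corresponding suffix appended.
df = pd.merge(left, right, on='TranscriptId', suffixes=['TransMap', ''])
print(df.columns.tolist())  # ['TranscriptId', 'ScoreTransMap', 'Score']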
def get_users_orders(self, prior_or_train):
    '''
    get users' prior detailed orders
    '''
    if os.path.exists(self.cache_dir + 'users_orders.pkl'):
        with open(self.cache_dir + 'users_orders.pkl', 'rb') as f:
            users_orders = pickle.load(f)
    else:
        orders = self.get_orders()
        order_products_prior = self.get_orders_items(prior_or_train)
        users_orders = pd.merge(order_products_prior, orders[['user_id', 'order_id', 'order_number', 'days_up_to_last']],
                                on=['order_id'], how='left')
        with open(self.cache_dir + 'users_orders.pkl', 'wb') as f:
            pickle.dump(users_orders, f, pickle.HIGHEST_PROTOCOL)
    return users_orders