def read_data_table_test(test_tables):
    test = pd.read_csv('../input/sample_submission.csv')[['image_name']]
    i = 0
    for t_path in test_tables:
        data = pd.read_csv(t_path)
        data.rename(columns={'Type_1': 'Type_1_num_{}'.format(i),
                             'Type_2': 'Type_2_num_{}'.format(i),
                             'Type_3': 'Type_3_num_{}'.format(i),
                             }, inplace=True)
        # Left join keeps every submission image; 'on' alone expresses the intended join
        # (combining it with left_index raises a MergeError in recent pandas).
        test = pd.merge(test, data, how='left', on='image_name')
        i += 1
    '''
    resolutions = pd.read_csv("../modified_data/resolutions_and_color_features_1.csv")
    resolutions = resolutions[resolutions['type'] == 'test']
    resolutions.drop(['type'], axis=1, inplace=True)
    test = pd.merge(test, resolutions, how='left', on='image_name', left_index=True)
    '''
    return test
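The function above builds the test table by repeatedly left-joining each blend table onto the sample-submission skeleton. For readers skimming these pd.merge() examples, a minimal sketch of that left-join pattern with made-up frames (not part of the original project):

import pandas as pd

skeleton = pd.DataFrame({'image_name': ['a.jpg', 'b.jpg', 'c.jpg']})
preds = pd.DataFrame({'image_name': ['a.jpg', 'c.jpg'],
                      'Type_1_num_0': [0.2, 0.5]})

# how='left' keeps every skeleton row; images without a prediction get NaN.
merged = pd.merge(skeleton, preds, how='left', on='image_name')
print(merged)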
Python merge() example source code
Source file: a40_run_xgboost_blender.py (project: KAGGLE_CERVICAL_CANCER_2017, author: ZFTurbo)
def read_data(file_01, file_02):
    data_01 = pd.read_csv(
        file_01,
        parse_dates={'timeline': ['btce-time_stamp']},
        infer_datetime_format=True)
    data_02 = pd.read_csv(
        file_02,
        parse_dates={'timeline': ['epoch_time_stamp']},
        infer_datetime_format=True)
    data_02 = data_02.drop_duplicates('epoch')
    data_01['timeline'] = data_01['timeline'].astype(float)
    data_02['timeline'] = data_02['timeline'].astype(float)
    # Align data_02 to data_01's timestamps by nearest-neighbour reindexing, then merge.
    data_ = data_02.set_index('timeline').reindex(data_01.set_index('timeline').index, method='nearest').reset_index()
    data = pd.merge(data_01, data_, on='timeline', suffixes=('_', ''))
    return data
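The reindex(..., method='nearest') step above aligns the two series on the closest timestamps before merging. The same nearest-timestamp join can also be written with pd.merge_asof; a minimal sketch on hypothetical data rather than the original files:

import pandas as pd

data_01 = pd.DataFrame({'timeline': [1.0, 2.5, 4.0], 'price': [10, 11, 12]})
data_02 = pd.DataFrame({'timeline': [0.9, 2.6, 3.8], 'volume': [100, 120, 90]})

# Both frames must be sorted on the key; direction='nearest' matches each
# left row with the closest right-hand timestamp.
aligned = pd.merge_asof(data_01.sort_values('timeline'),
                        data_02.sort_values('timeline'),
                        on='timeline', direction='nearest')
print(aligned)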
def digPHconveseRateV2(data_type='train'):
    df_output = ''
    save_path = ''
    if data_type == 'train':
        df_output = pd.read_csv('data/cutData/train_time_v7.csv')
        save_path = 'data/cutData/train_time_v8.csv'
    elif data_type == 'test':
        df_output = pd.read_csv('data/first_merge/test_join_v7.csv')
        save_path = 'data/first_merge/test_join_v8.csv'
    else:
        print('unknown data_type')
        return
    # Attach the per-user PL features with a left join on userID.
    df_userPH = pd.read_csv('data/feature/PL_user.csv')
    df_output = pd.merge(df_output, df_userPH, how='left', on='userID')
    del df_userPH
    print('saving......')
    df_output.to_csv(save_path, index=False)
def predict_test_prob(bst):
    df_all = loadCSV('data/first_merge/test_join_v9.csv')
    df_sta_lgbm = loadCSV('data/stacking/prob_lgbm_test.csv')
    print('merging stacking features......')
    # Attach the LightGBM stacking probabilities to the test features.
    df_all = pd.merge(df_all, df_sta_lgbm, how='left', on='instanceID')
    del df_sta_lgbm
    instanceID = df_all.instanceID.values
    feature_all = df_all.drop(['label', 'clickTime', 'instanceID',
                               'residence', 'appCategory'], axis=1).values
    del df_all
    dtest = xgb.DMatrix(feature_all)
    prob = bst.predict(dtest)
    output = pd.DataFrame({'instanceID': instanceID, 'prob': prob})
    output.to_csv('result/submission2.csv', index=False)

# LightGBM counterpart of the XGBoost prediction above
def predict_test_prob(lgbm):
    df_all = loadCSV('data/first_merge/test_join_v9.csv')
    df_sta_xgb = loadCSV('data/stacking/prob_xgb_test.csv')
    print('merging stacking features......')
    # Attach the XGBoost stacking probabilities to the test features.
    df_all = pd.merge(df_all, df_sta_xgb, how='left', on='instanceID')
    del df_sta_xgb
    instanceID = df_all.instanceID.values
    feature_all = df_all.drop(['label', 'clickTime', 'instanceID',
                               'residence', 'appCategory'], axis=1).values
    prob = lgbm.predict(feature_all, num_iteration=lgbm.best_iteration)
    output = pd.DataFrame({'instanceID': instanceID, 'prob': prob})
    output.to_csv('result/submission.csv', index=False)

# Split user_installedapps into chunks and join each with the app categories
def cut_install():
    df_installed = pd.read_csv('data/origin/user_installedapps.csv')
    df_app_cat = pd.read_csv('data/origin/app_categories.csv')
    print('splitting installed apps......')
    # Process the table in 10 slices to keep the merge's memory footprint small.
    total = len(df_installed)
    p_len = total // 10
    for i in range(9):
        print('processing part %d' % (i + 1))
        df_part = df_installed[i * p_len:(i + 1) * p_len]
        df_part = pd.merge(df_part, df_app_cat, how='left', on='appID')
        p_name = 'data/feature/install_cut/cut_p' + str(i + 1) + '.csv'
        df_part.to_csv(p_name, index=False)
        del df_part
    print('processing last part')
    df_part = df_installed[9 * p_len:]
    df_part = pd.merge(df_part, df_app_cat, how='left', on='appID')
    df_part.to_csv('data/feature/install_cut/cut_p10.csv', index=False)
    del df_part
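If memory is the only reason for the manual slicing above, the same chunk-and-merge can be driven by read_csv's chunksize parameter; a sketch under the same file layout, with the chunk size picked arbitrarily:

import pandas as pd

df_app_cat = pd.read_csv('data/origin/app_categories.csv')

# Stream the large table in 1,000,000-row chunks and left-join each chunk on appID.
reader = pd.read_csv('data/origin/user_installedapps.csv', chunksize=1000000)
for i, chunk in enumerate(reader, start=1):
    part = pd.merge(chunk, df_app_cat, how='left', on='appID')
    part.to_csv('data/feature/install_cut/cut_p{}.csv'.format(i), index=False)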
def iter_over_stocks(stock_lists, db):
    final_result = {}
    for x in stock_lists:
        for y in stock_lists:
            data1 = db.select_data(stock_lists[0], begin='2010-01-01', end='2015-12-30')
            data_frame = db.get_dataframe('sh600741', begin='2010-01-01', end='2015-12-30')
            data_frame = data_frame.set_index(data_frame.date)
            data_frame1 = db.get_dataframe('sh601668', begin='2010-01-01', end='2015-12-30')
            data_frame1 = data_frame1.set_index(data_frame1.date.values)
            base_data1 = raise_value(data_frame)
            base_data2 = raise_value(data_frame1)
            # Inner join on the date index keeps only the trading days both series share.
            result = pd.merge(base_data1, base_data2, left_index=True, right_index=True, how='inner')
            result = window_similarity(result)
            final_result.update({x + y: result})
    return final_result
def summary(self):
    """
    This function is used to summarize the result.
    If you want to calculate other indicators, you can add them here.
    :return:
    """
    if self._analysis is not None:
        self._analysis(self.asset_dict)
    # for x in self.asset_dict:
    #     self.get_benchmark()
    #     asset_return = (self.asset_dict[x] - self._base_fund) / self._base_fund
    #     asset_return = asset_return.add_prefix(str(x) + "_")
    #     print asset_return
    #     result = pd.merge(asset_return, self._benchmark_data,
    #                       left_index=True, right_index=True, how="inner")
    #     max_return = self.get_max_return(x, begin=self._begin_date, end=self._end_date)
    #     print max_return
    #     # print result
    #     # if self._analysis is not None:
    #     #     self._analysis(result)
    #     #     result.plot()
    #     #     plt.show()
def eval_f12(pred, real):
    '''
    param:
        pred --> dataframe of predicted (user_id, sku_id) pairs
        real --> dataframe of true (user_id, sku_id) pairs
    '''
    real['label'] = 1
    # Left-joining the truth onto the predictions marks each predicted pair as hit (1) or miss (0).
    pred = pd.merge(pred, real, on=['user_id', 'sku_id'], how='left')
    pred.fillna(0, inplace=True)
    p = pred.label.mean()
    r = np.sum(pred.label) / real.shape[0]
    f12 = (5 * p * r) / (2 * r + 3 * p)
    real.drop(['label'], axis=1, inplace=True)
    print('<--------------- F12 evaluation --------------->')
    print('precision --->: {0}'.format(p))
    print('recall --->: {0}'.format(r))
    print('F12 score -->: {0}'.format(f12))
    return f12
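A toy run of the merge trick above, with invented user/sku pairs, shows how the left join plus fillna(0) turns the label column into a hit indicator:

import numpy as np
import pandas as pd

pred = pd.DataFrame({'user_id': [1, 1, 2], 'sku_id': [10, 11, 20]})
real = pd.DataFrame({'user_id': [1, 2], 'sku_id': [10, 21]})
real['label'] = 1

hits = pd.merge(pred, real, on=['user_id', 'sku_id'], how='left').fillna(0)
p = hits.label.mean()                    # 1 of 3 predictions is a true pair -> precision 1/3
r = np.sum(hits.label) / real.shape[0]   # 1 of 2 true pairs was predicted  -> recall 1/2
f12 = (5 * p * r) / (2 * r + 3 * p)      # ~0.417 for this toy case
print(p, r, f12)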
def load_UCPair_onlyact(start_date='2016-02-01 00:00:00', end_date='2016-04-16 00:00:00', cate=[8]):
    '''
    For each user who acted on cate 8, flag whether cate 8 is the only category they acted on.
    '''
    df = get_action_data(start_date=start_date, end_date=end_date, field=['user_id', 'cate'])
    df = df.drop_duplicates()
    # Count the distinct categories each user touched, then merge the count back onto the rows.
    temp = df.groupby(['user_id']).size().reset_index(name='ncate')
    df = pd.merge(df, temp, on=['user_id'], how='left')
    df = df[df.cate == 8]  # note: cate 8 is hard-coded; the `cate` argument is unused
    df['ncate'] = (df['ncate'] == 1).astype(int)
    return df[['user_id', 'cate', 'ncate']]
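The groupby/size/merge-back pattern above can usually be collapsed into a single groupby transform, avoiding the intermediate frame and the merge; a minimal sketch assuming the same user_id/cate layout:

import pandas as pd

df = pd.DataFrame({'user_id': [1, 1, 2, 3],
                   'cate': [8, 5, 8, 8]}).drop_duplicates()

# transform('size') broadcasts each user's row count back onto every row.
df['ncate'] = df.groupby('user_id')['cate'].transform('size')
only_cate8 = df[df.cate == 8].assign(ncate=lambda d: (d.ncate == 1).astype(int))
print(only_cate8)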
# def get_uid_label(start_date='2016-02-01 00:00:00', end_date='2016-04-15 00:00:00'):
#     dump_path = './cache/uid_label_{0}_{1}.pkl'.format(start_date[:10], end_date[:10])
#     if os.path.exists(dump_path):
#         with open(dump_path, 'rb') as f:
#             df = pickle.load(f)
#     else:
#         df = get_action_data(start_date=start_date, end_date=end_date, field=['user_id', 'type'])
#         df = df[df.type==4].user_id.drop_duplicates().to_frame()
#         with open(dump_path, 'wb') as f:
#             pickle.dump(df, f)
#     return df
def load_brand_comment_ratio(end_date='2016-04-01 00:00:00'):
    '''
    Average bad-comment rate per brand, cached to disk.
    '''
    dump_path = './cache/brand_comment_ratio_{0}.pkl'.format(end_date[:10])
    if os.path.exists(dump_path):
        with open(dump_path, 'rb') as f:
            df = pickle.load(f)
    else:
        item_feat = load_base_item_feat(end_date=end_date)
        item_feat = item_feat[['sku_id', 'bad_comment_rate']]
        brands = get_action_data(start_date='2016-02-01 00:00:00', end_date=end_date, field=['sku_id', 'brand'])
        brands = brands.drop_duplicates()
        # Map each SKU to its brand, then average the bad-comment rate within each brand.
        df = pd.merge(item_feat, brands, on=['sku_id'], how='left')
        df = df[['brand', 'bad_comment_rate']]
        df = df.groupby(['brand'], as_index=False).mean()
        df.columns = ['brand', 'brand_bad_comment_rate']
        with open(dump_path, 'wb') as f:
            pickle.dump(df, f)
    return df
def fetch_feature_1(train_feature_path, finnal_feature_data_path):
    import pandas as pd
    train_feature_data = pd.read_csv(train_feature_path)
    print u'preparing user and comment data....'
    deal_with_user_data()
    deal_with_comment_data()
    # user_info = delete_user_info_no_1()
    user_info = pd.read_csv(user_data_final_path)
    comment_info = pd.read_csv(comment_data_final_path)
    print u'merging....'
    merge_feature_and_user_info = pd.merge(train_feature_data, user_info, on='user_id')
    merge_feature_and_user_info_and_comment = pd.merge(merge_feature_and_user_info, comment_info, on='sku_id')
    merge_feature_and_user_info_and_comment.to_csv(finnal_feature_data_path, index=False)
    print u'final feature file written to:', finnal_feature_data_path

# generate_feature_1.fetch_feature_1(train_one_train_feature_path_pre1,
#                                    train_one_train_feature_path_pre2)
def test():
    # Combine the feature tables built over different look-back windows into a single table.
    before_2_days_feature = pd.read_csv(one_before_2_days_feature_path)
    before_4_days_feature = pd.read_csv(one_before_4_days_feature_path)
    before_6_days_feature = pd.read_csv(one_before_6_days_feature_path)
    before_8_days_feature = pd.read_csv(one_before_8_days_feature_path)
    # Inspect the shapes.
    print u'2-days-before feature shape:', before_2_days_feature.shape
    print u'4-days-before feature shape:', before_4_days_feature.shape
    print u'6-days-before feature shape:', before_6_days_feature.shape
    print u'8-days-before feature shape:', before_8_days_feature.shape
    # Outer-join the tables pairwise on (user_id, sku_id), then combine the two results.
    new_data_df1 = pd.merge(before_2_days_feature, before_4_days_feature, on=['user_id', 'sku_id'], how='outer')
    new_data_df2 = pd.merge(before_6_days_feature, before_8_days_feature, on=['user_id', 'sku_id'], how='outer')
    new_data_df = pd.merge(new_data_df1, new_data_df2, on=['user_id', 'sku_id'], how='outer')
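A chain of pairwise outer joins like the one above can also be written as a single reduce over a list of frames; a minimal sketch with made-up feature columns:

from functools import reduce
import pandas as pd

frames = [
    pd.DataFrame({'user_id': [1, 2], 'sku_id': [10, 20], 'f2': [0.1, 0.2]}),
    pd.DataFrame({'user_id': [2, 3], 'sku_id': [20, 30], 'f4': [1.0, 2.0]}),
    pd.DataFrame({'user_id': [1, 3], 'sku_id': [10, 30], 'f6': [5, 6]}),
]

# Outer-join every frame on the (user_id, sku_id) key; pairs missing from a
# frame show up as NaN in that frame's columns.
merged = reduce(lambda l, r: pd.merge(l, r, on=['user_id', 'sku_id'], how='outer'), frames)
print(merged)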
def calculate_query_bin_bits(tfidf):  # this also needs to return the table from redis as well as the bin id
    table = str2int(ujson.loads(r.get('table')))
    dim = int(r.get('dim'))
    mapping = ujson.loads(r.get('map'))
    mapping = pd.DataFrame({'word': mapping})
    num_vectors = 16
    words = list(tfidf.keys())
    values = list(tfidf.values())
    tfidf_df = pd.DataFrame({'word': words, 'value': values})
    # Left-join the query's tf-idf weights onto the vocabulary order used for hashing;
    # vocabulary words absent from the query get weight 0.
    article_representation = pd.merge(mapping, tfidf_df, on='word', how='left').fillna(0)['value']
    bin_vectors = generate_random_vectors(num_vectors, dim)
    powers_of_two = 1 << np.arange(num_vectors - 1, -1, -1)
    query_bin_bits = (article_representation.dot(bin_vectors) >= 0)
    return query_bin_bits, table
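generate_random_vectors is not shown in this snippet; a plausible sketch, assuming it draws the random hyperplanes for sign-based LSH as a (dim, num_vectors) Gaussian matrix:

import numpy as np

def generate_random_vectors(num_vectors, dim, seed=0):
    # One Gaussian column per hash bit; the sign of the projection gives the bit.
    rng = np.random.RandomState(seed)
    return rng.randn(dim, num_vectors)

# The boolean bits can then be packed into a bin id with the powers_of_two vector:
# bin_id = query_bin_bits.dot(powers_of_two)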
Source file: ensemble.py (project: 5th_place_solution_facebook_check_ins, author: aikinogard)
def folderToDict(model_output_path, c, probas=None):
    logging.info("merge folder %s" % model_output_path)
    files = glob.glob(os.path.join(model_output_path, "*.csv"))
    if probas is None:
        probas = defaultdict(lambda: defaultdict(float))
    for f in files:
        logging.info("loading... %s" % f)
        df = pd.read_csv(f, dtype={"row_id": int, "place_id": int, "proba": float})
        for i in range(len(df)):
            probas[df["row_id"][i]][df["place_id"][i]] += c * df["proba"][i]
    try:
        with open(os.path.join(model_output_path, "map3.txt"), "r") as f_score:
            logging.info("map3=%6.6f" % float(f_score.read()))
    except:
        pass
    return probas
Source file: ensemble.py (project: 5th_place_solution_facebook_check_ins, author: aikinogard)
def parseDict(probas, output_name, valid_file=None):
    df = pd.DataFrame()
    df["row_id"] = probas.keys()
    df["place_id"] = df["row_id"].apply(lambda x: map(itemgetter(0),
                                        sorted(probas[x].items(), key=itemgetter(1), reverse=True)[:3]))
    if valid_file is not None:
        df_valid = pd.read_csv(valid_file, usecols=["row_id", "place_id"])
        df_valid.rename(columns={"place_id": "place_id_label"}, inplace=True)
        df_merge = pd.merge(df, df_valid, how="left", on="row_id")
        valid_score = metrics.mapk(df_merge.place_id_label.values[:, None],
                                   df_merge.place_id.values, 3)
        logging.info("total validation score: %f" % valid_score)
        np.savetxt("%s.txt" % output_name, [valid_score], fmt="%f")
        del df_valid
        del df_merge
    df["place_id"] = df["place_id"].apply(lambda x: " ".join(map(str, x)))
    df.to_csv("%s.csv" % output_name, index=False)
Source file: ensemble_add.py (project: 5th_place_solution_facebook_check_ins, author: aikinogard)
def folderToDict(model_output_path, c=1., probas=None):
    logging.info("merge folder %s" % model_output_path)
    files = glob.glob(os.path.join(model_output_path, "*.csv"))
    if probas is None:
        probas = defaultdict(lambda: defaultdict(float))
    for f in files:
        logging.info("loading... %s" % f)
        df = pd.read_csv(f, dtype={"row_id": int, "place_id": int, "proba": float})
        for i in range(len(df)):
            probas[df["row_id"][i]][df["place_id"][i]] += c * df["proba"][i]
    try:
        with open(os.path.join(model_output_path, "map3.txt"), "r") as f_score:
            logging.info("map3=%6.6f" % float(f_score.read()))
    except:
        pass
    return probas
def main():
    opts = get_parser().parse_args()
    rater_1 = pd.read_csv(opts.rater_1)[['participant_id', 'check-1']]
    rater_2 = pd.read_csv(opts.rater_2)[['participant_id', 'check-1']]
    rater_1.columns = ['participant_id', 'rater_1']
    rater_2.columns = ['participant_id', 'rater_2']
    # Outer join keeps participants rated by only one of the two raters.
    merged = pd.merge(rater_1, rater_2, on='participant_id', how='outer')
    idcol = 'participant_id'
    if opts.mapping_file:
        idcol = 'subject_id'
        name_mapping = pd.read_csv(
            opts.mapping_file, sep=' ', header=None, usecols=[0, 1])
        name_mapping.columns = ['subject_id', 'participant_id']
        name_mapping['participant_id'] = name_mapping.participant_id.astype(str) + '.gif'
        merged = pd.merge(name_mapping, merged, on='participant_id', how='outer')
    merged[[idcol, 'rater_1', 'rater_2']].sort_values(by=idcol).to_csv(opts.output, index=False)
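When debugging an outer join like the ones above, pandas' indicator flag shows which input contributed each row; a small sketch with invented participant ids:

import pandas as pd

rater_1 = pd.DataFrame({'participant_id': ['sub-01', 'sub-02'], 'rater_1': [1, 0]})
rater_2 = pd.DataFrame({'participant_id': ['sub-02', 'sub-03'], 'rater_2': [1, 1]})

# indicator=True adds a _merge column valued 'left_only', 'right_only' or 'both'.
merged = pd.merge(rater_1, rater_2, on='participant_id', how='outer', indicator=True)
print(merged)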
Source file: consensus.py (project: Comparative-Annotation-Toolkit, author: ComparativeGenomicsToolkit)
def calculate_improvement_metrics(final_consensus, scored_df, tm_eval_df, hgm_df, metrics):
    """For coding transcripts, how much did we improve the metrics?"""
    tm_df = tm_eval_df.reset_index()[['TransMapOriginalIntronsPercent', 'TranscriptId']]
    hgm_df_subset = hgm_df[hgm_df['AlignmentId'].apply(tools.nameConversions.aln_id_is_transmap)]
    hgm_df_subset = hgm_df_subset[['TranscriptId', 'IntronAnnotSupportPercent', 'IntronRnaSupportPercent']]
    tm_df = pd.merge(tm_df, hgm_df_subset, on='TranscriptId')
    df = pd.merge(tm_df, scored_df.reset_index(), on='TranscriptId', suffixes=['TransMap', ''])
    df = df.drop_duplicates(subset='AlignmentId')  # why do I need to do this?
    df = df.set_index('AlignmentId')
    metrics['Evaluation Improvement'] = {'changes': [], 'unchanged': 0}
    for aln_id, c in final_consensus:
        if c['transcript_biotype'] != 'protein_coding':
            continue
        if 'transMap' in c['transcript_modes']:
            metrics['Evaluation Improvement']['unchanged'] += 1
            continue
        tx_s = df.ix[aln_id]
        metrics['Evaluation Improvement']['changes'].append([tx_s.TransMapOriginalIntronsPercent,
                                                             tx_s.IntronAnnotSupportPercentTransMap,
                                                             tx_s.IntronRnaSupportPercentTransMap,
                                                             tx_s.OriginalIntronsPercent_mRNA,
                                                             tx_s.IntronAnnotSupportPercent,
                                                             tx_s.IntronRnaSupportPercent,
                                                             tx_s.TransMapGoodness,
                                                             tx_s.AlnGoodness_mRNA])
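The suffixes argument in the merge above is what yields paired columns such as IntronAnnotSupportPercentTransMap next to IntronAnnotSupportPercent; a minimal illustration with toy frames:

import pandas as pd

left = pd.DataFrame({'TranscriptId': ['tx1'], 'Score': [80.0]})
right = pd.DataFrame({'TranscriptId': ['tx1'], 'Score': [95.0]})

# Overlapping non-key columns get the corresponding suffix appended.
df = pd.merge(left, right, on='TranscriptId', suffixes=['TransMap', ''])
print(df.columns.tolist())  # ['TranscriptId', 'ScoreTransMap', 'Score']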
def get_users_orders(self, prior_or_train):
    '''
    get users' prior detailed orders
    '''
    if os.path.exists(self.cache_dir + 'users_orders.pkl'):
        with open(self.cache_dir + 'users_orders.pkl', 'rb') as f:
            users_orders = pickle.load(f)
    else:
        orders = self.get_orders()
        order_products_prior = self.get_orders_items(prior_or_train)
        users_orders = pd.merge(order_products_prior, orders[['user_id', 'order_id', 'order_number', 'days_up_to_last']],
                                on=['order_id'], how='left')
        with open(self.cache_dir + 'users_orders.pkl', 'wb') as f:
            pickle.dump(users_orders, f, pickle.HIGHEST_PROTOCOL)
    return users_orders