Python read_pickle() usage examples

io.py (project: sportsball, author: jgershen)
def combine_dataframe_into_pickle_file(dataframe, outfile, overwrite=False):
  """
  Save the provided pandas dataframe as a pickle to the provided file path. If a file is already present at that
  location, unpickle it, combine the dataframes, and save the result as a pickle (overwriting the file but keeping the
  data). Uses combine_first, prioritizing new data but keeping data from before.
  Obviously this will blow up catastrophically if there is a file at outfile which is not a DataFrame, and the data
  will get super gross if it *is* a DataFrame but the indices do not match.
  :param pandas.DataFrame dataframe: input dataframe
  :param str outfile: output file
  :param bool overwrite: if True, replace any existing pickle instead of merging
  :return None:
  """
  if os.path.exists(outfile) and not overwrite:
    target_df = pandas.read_pickle(outfile)
    merged_df = dataframe.combine_first(target_df)
    merged_df.to_pickle(outfile)
  else:
    dataframe.to_pickle(outfile)
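A quick illustration of the combine_first precedence the docstring describes; the two frames here are made up:

import pandas as pd

old = pd.DataFrame({'score': [1.0, 2.0]}, index=['a', 'b'])  # already on disk
new = pd.DataFrame({'score': [9.0]}, index=['a'])            # incoming data

merged = new.combine_first(old)
# index 'a' -> 9.0 (new data wins), index 'b' -> 2.0 (old data kept)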
day_data_process.py (project: fx, author: TaRyu)
def process(file_in=PATH_FILE_IN, file_out=PATH_FILE_FINAL):
    # data = pd.read_csv(file_in, dtype='str')
    # data['DateTime'] = pd.to_datetime(
    #     data['<DTYYYYMMDD>'].map(str) + data['<TIME>'].map(str),
    #     format='%Y%m%d%H%M%S')
    # data = data.set_index('DateTime')
    # data = pd.Series(data['<CLOSE>']).map(float)
    # data = data.resample('M').fillna(method='pad')
    # data = preprocessing.minmax_scale(data)
    # data_t = data[6:]
    # data_f = data.reshape(-1, 6)
    # data_f = np.array([data[i:i + 6] for i in range(data.shape[0] - 6 + 1)])
    # np.save(file_out[0], data_f[:len(data_f) - 1])
    # np.save(file_out[1], data_t)
    # Scale the hourly close series to [0, 1]; minmax_scale returns an ndarray.
    data = preprocessing.minmax_scale(pd.read_pickle(file_in)['close'])
    data = data.reshape(-1, 24)  # one row per day, 24 hourly values
    # Features: the first hourly value of five consecutive days.
    data_m = np.array([[data[i + x][0] for x in range(5)]
                       for i in range(len(data) - 5 + 1)])
    data_m = data_m.reshape(-1, 5)
    # Target: the first hourly value of the following day.
    data_s = np.array([data[i + 5][0]
                       for i in range(len(data) - 5)])
    np.save(file_out[0], data_m[:len(data_m) - 1])  # drop last window so X/y align
    np.save(file_out[1], data_s)
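A minimal sketch of the same windowing on toy data, to show how the saved feature rows line up with the targets (shapes here are made up):

import numpy as np

days = np.arange(7 * 24, dtype=np.float32).reshape(-1, 24)  # 7 fake days x 24 hours

windows = np.array([[days[i + x][0] for x in range(5)]
                    for i in range(len(days) - 5 + 1)])  # shape (3, 5)
targets = np.array([days[i + 5][0]
                    for i in range(len(days) - 5)])      # shape (2,)

features = windows[:len(windows) - 1]  # drop the last window: it has no target
assert len(features) == len(targets)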
data_features.py (project: fx, author: TaRyu)
def get_fs_t_5(file_in, file_out, i):
    # NOTE: in the original, the comprehensions below reused `i` as their
    # loop variable, shadowing the step parameter (the first row even came
    # out empty, breaking the reshape). Using `j` as the loop variable
    # assumes the parameter was meant as the sampling step throughout.
    data = pd.read_pickle(file_in)['close']
    data = data.values.reshape(-1, 24)  # .values: Series lost .reshape in modern pandas
    data = np.float32([[data[j + x][-1]
                        for x in range(5 * i) if x % i == 0]
                       for j in range(len(data) - 5 * i + 1)])
    data = data.reshape(-1, 5)
    data_t = {
        'change': np.float32(
            [(data[j + i][-1] - data[j + i][0]) /
             data[j + i][0] * 100
             for j in range(data.shape[0] - i)]),
        'target_open': np.float32([data[j + i][0]
                                   for j in range(data.shape[0] - i)]),
        'real_target': np.float32([data[j + i][-1]
                                   for j in range(data.shape[0] - i)])
    }
    data_t = pd.DataFrame(data_t)
    np.save(file_out[0], data[:len(data) - i])
    data_t.to_pickle(file_out[1])
new_month_data_process.py (project: fx, author: TaRyu)
def process(file_in=PATH_FILE_IN, file_out=PATH_FILE_FINAL):
    # data = pd.read_csv(file_in, dtype='str')
    # data['DateTime'] = pd.to_datetime(
    #     data['<DTYYYYMMDD>'].map(str) + data['<TIME>'].map(str),
    #     format='%Y%m%d%H%M%S')
    # data = data.set_index('DateTime')
    # data = pd.Series(data['<CLOSE>']).map(float)
    # data = data.resample('M').fillna(method='pad')
    # data = preprocessing.minmax_scale(data)
    # data_t = data[6:]
    # data_f = data.reshape(-1, 6)
    # data_f = np.array([data[i:i + 6] for i in range(data.shape[0] - 6 + 1)])
    # np.save(file_out[0], data_f[:len(data_f) - 1])
    # np.save(file_out[1], data_t)
    # Scale the hourly close series to [0, 1]; minmax_scale returns an ndarray.
    data = preprocessing.minmax_scale(pd.read_pickle(file_in)['close'])
    # Features: six values sampled every 24 * 24 = 576 hourly points.
    data_m = np.array([[data[i + x * 24 * 24] for x in range(6)]
                       for i in range(len(data) - 6 * 24 * 24 + 1)])
    data_m = data_m.reshape(-1, 6)
    # Target: the value one further 576-point step ahead.
    data_s = np.array([data[i + 6 * 24 * 24]
                       for i in range(len(data) - 6 * 24 * 24)])
    np.save(file_out[0], data_m[:len(data_m) - 1])  # drop last window so X/y align
    np.save(file_out[1], data_s)
prepare_data.py (project: VQA, author: VedantYadav)
def get_answers_matrix(split):
    if split == 'train':
        data_path = 'data/train_qa'
    elif split == 'val':
        data_path = 'data/val_qa'
    else:
        print('Invalid split!')
        sys.exit()

    df = pd.read_pickle(data_path)
    answers = df[['multiple_choice_answer']].values.tolist()
    answer_matrix = np.zeros((len(answers), 1001))
    default_onehot = np.zeros(1001)
    default_onehot[1000] = 1.0  # slot 1000 marks answers outside the top 1000

    for i, answer in enumerate(answers):
        answer_matrix[i] = answer_to_onehot_dict.get(answer[0].lower(), default_onehot)

    return answer_matrix
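The snippet depends on a module-level answer_to_onehot_dict built elsewhere in the project; a sketch of the structure it assumes, with a made-up vocabulary (index 1000 as the out-of-vocabulary slot):

import numpy as np

top_answers = ['yes', 'no', '2']  # stand-in for the project's real top-1000 list
answer_to_onehot_dict = {}
for idx, ans in enumerate(top_answers):
    onehot = np.zeros(1001)
    onehot[idx] = 1.0
    answer_to_onehot_dict[ans] = onehot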
prepare_data.py (project: VQA, author: VedantYadav)
def get_questions_matrix(split):
    if split == 'train':
        data_path = 'data/train_qa'
    elif split == 'val':
        data_path = 'data/val_qa'
    else:
        print('Invalid split!')
        sys.exit()

    df = pd.read_pickle(data_path)
    questions = df[['question']].values.tolist()
    word_idx = ebd.load_idx()
    seq_list = []

    for question in questions:
        words = word_tokenize(question[0])
        seq = []
        for word in words:
            seq.append(word_idx.get(word, 0))  # index 0 for out-of-vocabulary words
        seq_list.append(seq)
    question_matrix = pad_sequences(seq_list)

    return question_matrix
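The helpers used above are not shown in the snippet; the likely imports are word_tokenize from NLTK and pad_sequences from Keras, with ebd a project-local embedding module (assumptions, not confirmed by this excerpt):

from nltk.tokenize import word_tokenize                  # assumed source
from keras.preprocessing.sequence import pad_sequences   # assumed source
import ebd  # project module; ebd.load_idx() is assumed to return {word: index}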
income_db.py (project: kaggle-review, author: daxiongshu)
def _build(self,flags,files):
        path = flags.input_path
        Table = namedtuple('Table', 'name fname dtype')
        fnames = "adult.data,adult.test".split(',')
        names = "train,test".split(',')
        TABLES = [Table(i, "%s/%s" % (path, j), None)
                  for i, j in zip(names, fnames)
                  if files == "all" or i in files]

        print()
        self.flags = flags
        path = flags.data_path
        data = {}
        columns = [
            "age", "workclass", "fnlwgt", "education", "education_num",
            "marital_status", "occupation", "relationship", "race", "gender",
            "capital_gain", "capital_loss", "hours_per_week", "native_country",
            "income_bracket"
        ]

        for table in TABLES:
            name = table.name
            fname = table.fname
            dtype = table.dtype
            pname = "%s/%s.pkl"%(path,name.split('/')[-1].split('.')[0])
            if os.path.exists(pname):
                data[name] = pd.read_pickle(pname)
            else:
                if name == 'train':
                    data[name] = pd.read_csv(fname, dtype=dtype, header=None,
                                             skipinitialspace=True, names=columns)
                if name == 'test':
                    # adult.test has an extra first line, hence skiprows=1
                    data[name] = pd.read_csv(fname, dtype=dtype, header=None,
                                             skipinitialspace=True, skiprows=1,
                                             names=columns)
                # substring check also matches the ">50K." labels in adult.test
                data[name]['target'] = data[name]["income_bracket"].apply(
                    lambda x: ">50K" in x).astype(int)
                data[name].drop('income_bracket', axis=1, inplace=True)
                data[name].to_pickle(pname)
            print_mem_time("Loaded {} {}".format(fname.split('/')[-1],data[name].shape))
        self.data = data # no copy, pass the reference
        print()
xgb.py (project: kaggle-review, author: daxiongshu)
def read_data(name):
    train_pk = name.replace('.csv','.pkl')
    if not os.path.exists(train_pk):
        train = pd.read_csv(name)
        if "va" not in name and "test" not in name:
            train.to_pickle(train_pk)  # cache only training data as a pickle
    else:
        train = pd.read_pickle(train_pk)
    return train
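Used as a read-through cache: the first call pays for the CSV parse and writes the pickle, later calls load the pickle directly (the path is illustrative):

train = read_data('input/train.csv')  # parses the CSV, caches input/train.pkl
train = read_data('input/train.csv')  # now served from the pickle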
data.py (project: kaggle-review, author: daxiongshu)
def _load_u2o(self):
        if self.u2o:
            return
        path = self.flags.data_path
        p = "%s/u2o.pkl"%path
        if not os.path.exists(p):
            self._load_db()
            # u2o maps each user_id to the list of that user's order_ids
            u2o = self.pdDB.data['orders'].groupby('user_id')['order_id'].apply(list)
            u2o.to_pickle(p)
        else:
            u2o = pd.read_pickle(p)
        self.u2o = u2o
        print_mem_time("Loaded u2o %d"%len(u2o))
personal_db.py (project: kaggle-review, author: daxiongshu)
def _build(self,flags,files):
        fnames,names = self.fnames,self.names
        path = self.path 
        Table = namedtuple('Table', 'name fname dtype')
        tables = [Table(i, "%s/%s" % (path, j), {})
                  for i, j in zip(names, fnames)
                  if files == "all" or i in files]

        print()
        self.flags = flags
        path = flags.data_path
        data = {}
        for table in tables:
            name,fname,dtype = table.name,table.fname,table.dtype
            pname = "%s/%s_%s.pkl"%(path,self.name,name.split('/')[-1].split('.')[0])
            if os.path.exists(pname):
                data[name] = pd.read_pickle(pname)
            else: 
                if '_text' in name:
                    # raw string: "\|" is an invalid escape in a plain literal;
                    # a multi-character regex sep also needs the python engine
                    data[name] = pd.read_csv(fname, header=None, sep=r"\|\|",
                                             engine='python', skiprows=1,
                                             names=['ID', 'Text'])
                else:
                    data[name] = pd.read_csv(fname)
                data[name].to_pickle(pname)
            print_mem_time("Loaded {} {}".format(fname.split('/')[-1],data[name].shape))
        self.data = data # no copy, pass the reference
        if "training_variants" in self.data:
            y = self.data["training_variants"]['Class']-1
            from utils.np_utils.encoder import onehot_encode
            self.y = onehot_encode(y,self.flags.classes)
        print()
scrub_rescrape.py (project: glassdoor-analysis, author: THEdavehogue)
def combine_data(paths):
    '''
    Function to combine dataframes from pickled form

    INPUT:
        paths: Iterable of filepaths for pickled DataFrames

    OUTPUT:
        ratings_df: Single pandas DataFrame with all ratings
    '''
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # replacement and avoids re-copying the frame on every loop iteration
    ratings_df = pd.concat([pd.read_pickle(path) for path in paths])
    return ratings_df
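A typical call site gathers the pickle paths first, e.g. with glob (the pattern is hypothetical):

import os
from glob import glob

paths = sorted(glob(os.path.join('data', 'ratings_*.pkl')))
ratings_df = combine_data(paths)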
scrub_rescrape.py (project: glassdoor-analysis, author: THEdavehogue)
def check_review_counts(ratings_df):
    '''
    Function to check that enough data was collected. Compares number of reviews
    for each target employer with the number of reviews collected

    INPUT:
        ratings_df: Pandas DataFrame containing scraped review text

    OUTPUT:
        good_er_ids, bad_er_ids: Lists of tuples to rescrape from glassdoor
    '''
    clean_df = pd.read_pickle(os.path.join('data', 'clean_employers.pkl'))
    target_ratings = clean_df[['company_name', 'company_id',
                               'num_ratings', 'overall_rating']]
    company_ratings = ratings_df['company_name'].value_counts()
    company_ratings = company_ratings.to_frame(name='ratings_collected')
    company_ratings.reset_index(inplace=True)
    check_df = target_ratings.merge(company_ratings,
                                    how='left',
                                    left_on='company_name',
                                    right_on='index')
    check_df['company_id'] = check_df['company_id'].astype(int)
    check_df.drop('index', axis=1, inplace=True)
    check_df['delta'] = check_df['num_ratings'] - check_df['ratings_collected']
    check_df['delta_pct'] = check_df['delta'] / check_df['num_ratings']
    rescrape = check_df[check_df['delta_pct'] > 0.5]
    good_rescrape = rescrape[rescrape['overall_rating'] > 3.5]
    bad_rescrape = rescrape[rescrape['overall_rating'] < 3.5]
    # list() matters: a bare zip object cannot be pickled under Python 3
    good_er_ids = list(zip(good_rescrape['company_name'],
                           good_rescrape['company_id']))
    bad_er_ids = list(zip(bad_rescrape['company_name'],
                          bad_rescrape['company_id']))
    with open(os.path.join('data', 'rescrape_pros.pkl'), 'wb') as f:
        pickle.dump(good_er_ids, f)
    with open(os.path.join('data', 'rescrape_cons.pkl'), 'wb') as f:
        pickle.dump(bad_er_ids, f)
    return good_er_ids, bad_er_ids
__main__.py (project: InplusTrader_Linux, author: zhengwsh)
def plot(result_dict_file, is_show, plot_save_file):
    """
    Draw result DataFrame
    """
    import pandas as pd
    from rqalpha.plot import plot_result

    result_dict = pd.read_pickle(result_dict_file)
    if is_show:
        plot_result(result_dict)
    if plot_save_file:
        plot_result(result_dict, show_windows=False, savefile=plot_save_file)
__main__.py (project: InplusTrader_Linux, author: zhengwsh)
def report(result_pickle_file_path, target_report_csv_path):
    """
    Generate report from backtest output file
    """
    import pandas as pd
    result_dict = pd.read_pickle(result_pickle_file_path)

    from rqalpha.utils.report import generate_report
    generate_report(result_dict, target_report_csv_path)
__init__.py (project: InplusTrader_Linux, author: zhengwsh)
def plot(result_dict_file, show, plot_save_file):
    """
    [sys_analyser] draw result DataFrame
    """
    import pandas as pd
    from .plot import plot_result

    result_dict = pd.read_pickle(result_dict_file)
    plot_result(result_dict, show, plot_save_file)
__init__.py (project: InplusTrader_Linux, author: zhengwsh)
def report(result_pickle_file_path, target_report_csv_path):
    """
    [sys_analyser] Generate report from backtest output file
    """
    import pandas as pd
    result_dict = pd.read_pickle(result_pickle_file_path)

    from .report import generate_report
    generate_report(result_dict, target_report_csv_path)
matplotlib_charting.py (project: seniority_list, author: rubydatasystems)
def display_proposals():
    '''print out a list of the proposal names which were generated and stored
    in the dill folder by the build_program_files script

    no inputs
    '''
    print('proposal list:')
    print(list(pd.read_pickle('dill/proposal_names.pkl').proposals))
vis.py (project: FHDMM, author: aweinstein)
def plot_actions(cue=0):
    mpl.rcParams['axes.labelsize'] = 'large'
    d_map = {3:1, 8:2, 14:3, 23:4}
    df = pd.read_pickle('data.pkl').reset_index()
    df = df.loc[df['cue'] == cue]
    # FacetGrid's `size=` parameter was renamed `height=` in seaborn 0.9
    g = sns.FacetGrid(df, col='subject',
                      col_wrap=6, height=1.5, ylim=(0, 5), aspect=1.5)

    g.map(plt.plot, 'action')
    g.set(xticks=[], yticks=[0,1,2,3], yticklabels=['3', '8', '14', '23'])
    g.set(ylim=(-0.5, 4))
    g.set_ylabels('choice')
    g.fig.tight_layout()
    g.fig.subplots_adjust(top=0.93)


    subjects = df['subject'].unique()
    for ax, subject in zip(g.axes, subjects):
        df_subject = df.loc[df['subject'] == subject]
        df_subject.reset_index(inplace=True)
        df_wins = df_subject.loc[df_subject['reward'] > 0]
        df_lose = df_subject.loc[df_subject['reward'] < 0]
        pos_win = df_wins.loc[df_wins['subject'] == subject].index
        pos_lose = df_lose.loc[df_lose['subject'] == subject].index
        ax.eventplot(pos_win, lineoffsets=3.5, linelength=0.75,
                     linewidths=0.4)
        ax.eventplot(pos_lose, lineoffsets=3.5, linelength=0.75,
                     color='r', linewidths=0.4)
    plt.tight_layout()
    plt.savefig('actions_0.pdf')
    plt.show()
    globals().update(locals())  # debugging aid: dump locals into the module namespace
log.py (project: birdsong-keras, author: bapalto)
def appendDfToPickle(df, filePath):
    import os
    import pandas as pd
    if not os.path.isfile(filePath):
        df.to_pickle(filePath)
    else:
        # DataFrame.append was removed in pandas 2.0; concat is the replacement
        tempDF = pd.read_pickle(filePath)
        tempDF = pd.concat([tempDF, df], ignore_index=True)
        tempDF.to_pickle(filePath)
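Calling it once per event accumulates rows across runs, which suits incremental logging (the frame below is made up):

import pandas as pd

row = pd.DataFrame({'epoch': [1], 'loss': [0.42]})
appendDfToPickle(row, 'training_log.pkl')  # first call creates the file
appendDfToPickle(row, 'training_log.pkl')  # later calls append a row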
UtilityFunctions.py (project: chainladder-python, author: jbogaardt)
def load_dataset(key):
    """ Function to load datasets included in the chainladder package.

    Arguments:
    key: str
        The name of the dataset, e.g. RAA, ABC, UKMotor, GenIns, etc.

    Returns:
    pandas.DataFrame of the loaded dataset.
   """
    path = os.path.dirname(os.path.abspath(__file__))
    return read_pickle(os.path.join(path, 'data', key))
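Typical usage loads one of the bundled triangles by name:

raa = load_dataset('RAA')  # pandas.DataFrame of the RAA triangle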

