def combine_dataframe_into_pickle_file(dataframe, outfile, overwrite=False):
"""
    Save the provided pandas DataFrame as a pickle at the given file path. If a file already exists at that
    location, unpickle it, combine the two DataFrames, and pickle the result (overwriting the file while keeping
    the existing data). Uses combine_first, so new data takes priority and earlier data fills the gaps.
    Note that this fails if the existing file at outfile is not a pickled DataFrame, and the merged result can be
    inconsistent if the two DataFrames' indices do not align.
    :param pandas.DataFrame dataframe: input dataframe
    :param str outfile: output file path
    :param bool overwrite: if True, replace any existing file instead of merging with it
    :return: None
"""
    if os.path.exists(outfile) and not overwrite:
        target_df = pandas.read_pickle(outfile)
        merged_df = dataframe.combine_first(target_df)
        merged_df.to_pickle(outfile)
    else:
        dataframe.to_pickle(outfile)
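A minimal usage sketch for the helper above (file name and values are illustrative; it assumes pandas and os are imported as in the snippet). The second call merges with the already-pickled frame via combine_first, so the newer value wins where the indices overlap:

import pandas

df_a = pandas.DataFrame({'price': [1.0, 2.0]}, index=[0, 1])
df_b = pandas.DataFrame({'price': [2.5, 3.0]}, index=[1, 2])
combine_dataframe_into_pickle_file(df_a, 'prices.pkl')
combine_dataframe_into_pickle_file(df_b, 'prices.pkl')
print(pandas.read_pickle('prices.pkl'))
# index 0 -> 1.0 (kept), index 1 -> 2.5 (new value wins), index 2 -> 3.0 (added)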
Example source code for Python's read_pickle()
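For orientation, a minimal round trip with to_pickle / read_pickle (the file name is illustrative):

import pandas as pd

df = pd.DataFrame({'close': [1.0, 1.5, 1.2]})
df.to_pickle('close.pkl')               # serialise the DataFrame to disk
restored = pd.read_pickle('close.pkl')  # load it back unchanged
assert restored.equals(df)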
def process(file_in=PATH_FILE_IN, file_out=PATH_FILE_FINAL):
# data = pd.read_csv(file_in, dtype='str')
# data['DateTime'] = pd.to_datetime(
# data['<DTYYYYMMDD>'].map(str) + data['<TIME>'].map(str),
# format='%Y%m%d%H%M%S')
# data = data.set_index('DateTime')
# data = pd.Series(data['<CLOSE>']).map(float)
# data = data.resample('M').fillna(method='pad')
# data = preprocessing.minmax_scale(data)
# data_t = data[6:]
# data_f = data.reshape(-1, 6)
# data_f = np.array([data[i:i + 6] for i in range(data.shape[0] - 6 + 1)])
# np.save(file_out[0], data_f[:len(data_f) - 1])
# np.save(file_out[1], data_t)
    data = preprocessing.minmax_scale(pd.read_pickle(file_in)['close'])
    # one row per 24 consecutive (presumably hourly) closes
    data = data.reshape(-1, 24)
    # features: the first close of 5 consecutive rows
    data_m = np.array([[data[i + x][0] for x in range(5)]
                       for i in range(len(data) - 5 + 1)])
    data_m = data_m.reshape(-1, 5)
    # target: the first close of the row that follows each 5-row window
    data_s = np.array([data[i + 5][0]
                       for i in range(len(data) - 5)])
    # drop the last feature row so features and targets line up
    np.save(file_out[0], data_m[:len(data_m) - 1])
    np.save(file_out[1], data_s)
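To make the window/target indexing above concrete, a toy sketch of the same construction on a small array (names here are illustrative only):

import numpy as np

first_of_day = np.arange(8)  # stands in for data[i][0], the first close of each row
data_m = np.array([[first_of_day[i + x] for x in range(5)]
                   for i in range(len(first_of_day) - 5 + 1)])
data_s = np.array([first_of_day[i + 5] for i in range(len(first_of_day) - 5)])
print(data_m[:len(data_m) - 1])  # windows [0..4], [1..5], [2..6]
print(data_s)                    # matching next-row targets 5, 6, 7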
def get_fs_t_5(file_in, file_out, i):
    # i is the sampling stride (in rows of 24 closes) between the points of each window
    data = pd.read_pickle(file_in)['close']
    data = np.asarray(data).reshape(-1, 24)
    # features: the last close of 5 rows spaced i apart
    # (j indexes window starts; i is the stride between sampled rows)
    data = np.float32([[data[j + x][-1]
                        for x in range(5 * i) if x % i == 0]
                       for j in range(len(data) - 5 * i + 1)])
    data = data.reshape(-1, 5)
    # targets come from the window i rows ahead of each feature window
    data_t = {
        'change': np.float32(
            [(data[j + i][-1] - data[j + i][0]) /
             data[j + i][0] * 100
             for j in range(data.shape[0] - i)]),
        'target_open': np.float32([data[j + i][0]
                                   for j in range(data.shape[0] - i)]),
        'real_target': np.float32([data[j + i][-1]
                                   for j in range(data.shape[0] - i)])
    }
    data_t = pd.DataFrame(data_t)
    np.save(file_out[0], data[:len(data) - i])
    data_t.to_pickle(file_out[1])
def process(file_in=PATH_FILE_IN, file_out=PATH_FILE_FINAL):
# data = pd.read_csv(file_in, dtype='str')
# data['DateTime'] = pd.to_datetime(
# data['<DTYYYYMMDD>'].map(str) + data['<TIME>'].map(str),
# format='%Y%m%d%H%M%S')
# data = data.set_index('DateTime')
# data = pd.Series(data['<CLOSE>']).map(float)
# data = data.resample('M').fillna(method='pad')
# data = preprocessing.minmax_scale(data)
# data_t = data[6:]
# data_f = data.reshape(-1, 6)
# data_f = np.array([data[i:i + 6] for i in range(data.shape[0] - 6 + 1)])
# np.save(file_out[0], data_f[:len(data_f) - 1])
# np.save(file_out[1], data_t)
    data = preprocessing.minmax_scale(pd.read_pickle(file_in)['close'])
    # features: 6 closes spaced 24 * 24 samples apart
    data_m = np.array([[data[i + x * 24 * 24] for x in range(6)]
                       for i in range(len(data) - 6 * 24 * 24 + 1)])
    data_m = data_m.reshape(-1, 6)
    # target: the close one further stride (24 * 24 samples) after the last window element
    data_s = np.array([data[i + 6 * 24 * 24]
                       for i in range(len(data) - 6 * 24 * 24)])
    np.save(file_out[0], data_m[:len(data_m) - 1])
    np.save(file_out[1], data_s)
def get_answers_matrix(split):
    if split == 'train':
        data_path = 'data/train_qa'
    elif split == 'val':
        data_path = 'data/val_qa'
    else:
        print('Invalid split!')
        sys.exit()
    df = pd.read_pickle(data_path)
    answers = df[['multiple_choice_answer']].values.tolist()
    # one-hot rows over 1001 classes; index 1000 is the fallback class for answers
    # missing from answer_to_onehot_dict
    answer_matrix = np.zeros((len(answers), 1001))
    default_onehot = np.zeros(1001)
    default_onehot[1000] = 1.0
    for i, answer in enumerate(answers):
        answer_matrix[i] = answer_to_onehot_dict.get(answer[0].lower(), default_onehot)
    return answer_matrix
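answer_to_onehot_dict is defined elsewhere in that project; as an assumption, it presumably maps each frequent answer string to a one-hot row, roughly like this hypothetical sketch (names and the answer list are made up for illustration):

import numpy as np

def build_answer_onehot_dict(top_answers):
    # hypothetical helper: top_answers is an ordered list of frequent answer strings;
    # each gets a one-hot vector over len(top_answers) + 1 classes, leaving the final
    # index free as the "unknown answer" fallback used above
    onehot_dict = {}
    for idx, ans in enumerate(top_answers):
        vec = np.zeros(len(top_answers) + 1)
        vec[idx] = 1.0
        onehot_dict[ans.lower()] = vec
    return onehot_dict

answer_to_onehot_dict = build_answer_onehot_dict(['yes', 'no', '2'])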
def get_questions_matrix(split):
    if split == 'train':
        data_path = 'data/train_qa'
    elif split == 'val':
        data_path = 'data/val_qa'
    else:
        print('Invalid split!')
        sys.exit()
    df = pd.read_pickle(data_path)
    questions = df[['question']].values.tolist()
    word_idx = ebd.load_idx()
    seq_list = []
    for question in questions:
        words = word_tokenize(question[0])
        # map each token to its embedding index; unknown words fall back to 0
        seq = [word_idx.get(word, 0) for word in words]
        seq_list.append(seq)
    # pad the variable-length index sequences to a common length
    question_matrix = pad_sequences(seq_list)
    return question_matrix
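pad_sequences here is presumably Keras's keras.preprocessing.sequence.pad_sequences, which by default left-pads each index list with zeros up to the length of the longest sequence, e.g.:

from keras.preprocessing.sequence import pad_sequences

seq_list = [[4, 7, 2], [9]]
print(pad_sequences(seq_list))
# [[4 7 2]
#  [0 0 9]]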
def _build(self,flags,files):
path = flags.input_path
Table = namedtuple('Table', 'name fname dtype')
fnames = "adult.data,adult.test".split(',')
names = "train,test".split(',')
TABLES = [Table(i,"%s/%s"%(path,j),None) for i,j in zip(names,fnames) if files =="all" or i in files]
print()
self.flags = flags
path = flags.data_path
data = {}
columns = [
"age", "workclass", "fnlwgt", "education", "education_num",
"marital_status", "occupation", "relationship", "race", "gender",
"capital_gain", "capital_loss", "hours_per_week", "native_country",
"income_bracket"
]
for table in TABLES:
name = table.name
fname = table.fname
dtype = table.dtype
pname = "%s/%s.pkl"%(path,name.split('/')[-1].split('.')[0])
if os.path.exists(pname):
data[name] = pd.read_pickle(pname)
else:
if name == 'train':
data[name] = pd.read_csv(fname,dtype=dtype,header=None,skipinitialspace=True,
names=columns)
if name == 'test':
data[name] = pd.read_csv(fname,dtype=dtype,header=None,skipinitialspace=True,
skiprows=1,names=columns)
data[name]['target'] = data[name]["income_bracket"].apply(lambda x: ">50K" in x).astype(int)
data[name].drop('income_bracket',axis=1,inplace=True)
data[name].to_pickle(pname)
print_mem_time("Loaded {} {}".format(fname.split('/')[-1],data[name].shape))
    self.data = data # no copy, pass the reference
print()
def read_data(name):
train_pk = name.replace('.csv','.pkl')
    if not os.path.exists(train_pk):
train = pd.read_csv(name)
if "va" not in name and "test" not in name:
train.to_pickle(train_pk)
else:
train = pd.read_pickle(train_pk)
return train
def _load_u2o(self):
if self.u2o:
return
path = self.flags.data_path
p = "%s/u2o.pkl"%path
    if not os.path.exists(p):
self._load_db()
u2o = self.pdDB.data['orders'].groupby('user_id')['order_id'].apply(list)
u2o.to_pickle(p)
else:
u2o = pd.read_pickle(p)
self.u2o = u2o
print_mem_time("Loaded u2o %d"%len(u2o))
def _build(self,flags,files):
fnames,names = self.fnames,self.names
path = self.path
Table = namedtuple('Table', 'name fname dtype')
tables = [Table(i,"%s/%s"%(path,j),{}) for i,j in zip(names,fnames) if files =="all" or i in files]
print()
self.flags = flags
path = flags.data_path
data = {}
for table in tables:
name,fname,dtype = table.name,table.fname,table.dtype
pname = "%s/%s_%s.pkl"%(path,self.name,name.split('/')[-1].split('.')[0])
if os.path.exists(pname):
data[name] = pd.read_pickle(pname)
else:
            if '_text' in name:
                # the regex separator needs the python parser engine; a raw string avoids invalid-escape warnings
                data[name] = pd.read_csv(fname, header=None, sep=r"\|\|", engine='python',
                                         skiprows=1, names=['ID', 'Text'])
            else:
                data[name] = pd.read_csv(fname)
data[name].to_pickle(pname)
print_mem_time("Loaded {} {}".format(fname.split('/')[-1],data[name].shape))
self.data = data # no copy, pass the reference
if "training_variants" in self.data:
y = self.data["training_variants"]['Class']-1
from utils.np_utils.encoder import onehot_encode
self.y = onehot_encode(y,self.flags.classes)
print()
def combine_data(paths):
'''
Function to combine dataframes from pickled form
INPUT:
paths: Iterable of filepaths for pickled DataFrames
OUTPUT:
ratings_df: Single pandas DataFrame with all ratings
'''
    ratings_df = pd.read_pickle(paths[0])
    # DataFrame.append is deprecated (removed in pandas 2.0); concatenate instead
    for path in paths[1:]:
        ratings_df = pd.concat([ratings_df, pd.read_pickle(path)])
    return ratings_df
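A small usage sketch for combine_data, assuming the rating frames were pickled beforehand (paths and columns are illustrative):

import pandas as pd

pd.DataFrame({'rating': [5, 4]}).to_pickle('ratings_part1.pkl')
pd.DataFrame({'rating': [3]}).to_pickle('ratings_part2.pkl')
ratings_df = combine_data(['ratings_part1.pkl', 'ratings_part2.pkl'])
print(len(ratings_df))  # 3 rows gathered from both pickles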
def check_review_counts(ratings_df):
'''
Function to check that enough data was collected. Compares number of reviews
for each target employer with the number of reviews collected
INPUT:
ratings_df: Pandas DataFrame containing scraped review text
OUTPUT:
good_er_ids, bad_er_ids: Lists of tuples to rescrape from glassdoor
'''
clean_df = pd.read_pickle(os.path.join('data', 'clean_employers.pkl'))
target_ratings = clean_df[['company_name', 'company_id',
'num_ratings', 'overall_rating']]
company_ratings = ratings_df['company_name'].value_counts()
company_ratings = company_ratings.to_frame(name='ratings_collected')
company_ratings.reset_index(inplace=True)
check_df = target_ratings.merge(company_ratings,
how='left',
left_on='company_name',
right_on='index')
check_df['company_id'] = check_df['company_id'].astype(int)
check_df.drop('index', axis=1, inplace=True)
check_df['delta'] = check_df['num_ratings'] - check_df['ratings_collected']
check_df['delta_pct'] = check_df['delta'] / check_df['num_ratings']
rescrape = check_df[check_df['delta_pct'] > 0.5]
good_rescrape = rescrape[rescrape['overall_rating'] > 3.5]
bad_rescrape = rescrape[rescrape['overall_rating'] < 3.5]
    # wrap in list so the return values really are lists of tuples (zip is a lazy iterator on Python 3)
    good_er_ids = list(zip(good_rescrape['company_name'],
                           good_rescrape['company_id']))
    bad_er_ids = list(zip(bad_rescrape['company_name'], bad_rescrape['company_id']))
pickle.dump(good_er_ids,
open(os.path.join('data', 'rescrape_pros.pkl'), 'wb'))
pickle.dump(bad_er_ids,
open(os.path.join('data', 'rescrape_cons.pkl'), 'wb'))
return good_er_ids, bad_er_ids
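The counting step above leans on value_counts plus reset_index to get a joinable frame; a toy version of that chain (company names made up):

import pandas as pd

ratings_df = pd.DataFrame({'company_name': ['Acme', 'Acme', 'Globex']})
counts = ratings_df['company_name'].value_counts().to_frame(name='ratings_collected')
counts.reset_index(inplace=True)
# one row per company with its collected-review count; in older pandas the company
# column comes back literally named 'index', which is why the merge above uses right_on='index'
print(counts)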
def plot(result_dict_file, is_show, plot_save_file):
"""
Draw result DataFrame
"""
import pandas as pd
from rqalpha.plot import plot_result
result_dict = pd.read_pickle(result_dict_file)
if is_show:
plot_result(result_dict)
if plot_save_file:
plot_result(result_dict, show_windows=False, savefile=plot_save_file)
def report(result_pickle_file_path, target_report_csv_path):
"""
Generate report from backtest output file
"""
import pandas as pd
result_dict = pd.read_pickle(result_pickle_file_path)
from rqalpha.utils.report import generate_report
generate_report(result_dict, target_report_csv_path)
def plot(result_dict_file, show, plot_save_file):
"""
[sys_analyser] draw result DataFrame
"""
import pandas as pd
from .plot import plot_result
result_dict = pd.read_pickle(result_dict_file)
plot_result(result_dict, show, plot_save_file)
def report(result_pickle_file_path, target_report_csv_path):
"""
[sys_analyser] Generate report from backtest output file
"""
import pandas as pd
result_dict = pd.read_pickle(result_pickle_file_path)
from .report import generate_report
generate_report(result_dict, target_report_csv_path)
def display_proposals():
'''print out a list of the proposal names which were generated and stored
in the dill folder by the build_program_files script
no inputs
'''
print('proposal list:')
print(list(pd.read_pickle('dill/proposal_names.pkl').proposals))
def plot_actions(cue=0):
mpl.rcParams['axes.labelsize'] = 'large'
d_map = {3:1, 8:2, 14:3, 23:4}
df = pd.read_pickle('data.pkl').reset_index()
df = df.loc[df['cue'] == cue]
g = sns.FacetGrid(df, col='subject',
col_wrap=6, size=1.5, ylim=(0, 5), aspect=1.5)
g.map(plt.plot, 'action')
g.set(xticks=[], yticks=[0,1,2,3], yticklabels=['3', '8', '14', '23'])
g.set(ylim=(-0.5, 4))
g.set_ylabels('choice')
g.fig.tight_layout()
g.fig.subplots_adjust(top=0.93)
subjects = df['subject'].unique()
for ax, subject in zip(g.axes, subjects):
df_subject = df.loc[df['subject'] == subject]
df_subject.reset_index(inplace=True)
df_wins = df_subject.loc[df_subject['reward'] > 0]
df_lose = df_subject.loc[df_subject['reward'] < 0]
pos_win = df_wins.loc[df_wins['subject'] == subject].index
pos_lose = df_lose.loc[df_lose['subject'] == subject].index
ax.eventplot(pos_win, lineoffsets=3.5, linelength=0.75,
linewidths=0.4)
ax.eventplot(pos_lose, lineoffsets=3.5, linelength=0.75,
color='r', linewidths=0.4)
plt.tight_layout()
plt.savefig('actions_0.pdf')
plt.show()
globals().update(locals())
def appendDfToPickle(df, filePath):
import os
import pandas as pd
if not os.path.isfile(filePath):
df.to_pickle(filePath)
else:
        tempDF = pd.read_pickle(filePath)
        # DataFrame.append was removed in pandas 2.0; concat appends the new rows instead
        tempDF = pd.concat([tempDF, df], ignore_index=True)
        tempDF.to_pickle(filePath)
def load_dataset(key):
""" Function to load datasets included in the chainladder package.
Arguments:
key: str
The name of the dataset, e.g. RAA, ABC, UKMotor, GenIns, etc.
Returns:
pandas.DataFrame of the loaded dataset.
"""
path = os.path.dirname(os.path.abspath(__file__))
return read_pickle(os.path.join(path, 'data', key))
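Example call, using the RAA dataset named in the docstring (assumes the package's bundled data files are installed alongside it):

raa = load_dataset('RAA')
print(raa.head())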