def load_pkl():
'''
loads a pickled DataFrame with the employers to scrape ratings for.
INPUT:
None
OUTPUT:
df: pandas DataFrame
split: threshold of good/bad employer ratings
'''
df = pd.read_pickle(os.path.join('data', 'clean_employers.pkl'))
df['company_id'] = df['company_id'].astype(int)
df['num_ratings'] = df['num_ratings'].astype(int)
split = df['overall_rating'].mean()
return df, split
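Every snippet collected on this page revolves around the same round trip: DataFrame.to_pickle serializes a pandas object to disk and pd.read_pickle restores it unchanged. A minimal, self-contained sketch (the file name is illustrative):

import pandas as pd

df = pd.DataFrame({'company_id': [1, 2], 'overall_rating': [3.9, 4.2]})
df.to_pickle('example.pkl')               # write the frame to disk
restored = pd.read_pickle('example.pkl')  # read it back with dtypes and index intact
assert restored.equals(df)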
Python read_pickle() usage examples
scrape_ratings_threaded.py (project: glassdoor-analysis, author: THEdavehogue)
def _load_o2p(self):
if self.o2p:
return
path = self.flags.data_path
p = "%s/o2p.pkl"%path
if not os.path.exists(p):
self._load_db()
ops = self.pdDB.data['op_prior']
ops = pd.concat([ops, self.pdDB.data['op_train']])  # DataFrame.append was removed in pandas 2.0
o2p = ops.sort_values(['order_id', 'add_to_cart_order'])\
.groupby('order_id')['product_id'].apply(list)
o2p.to_pickle(p)
else:
o2p = pd.read_pickle(p)
self.o2p = o2p
print_mem_time("Loaded o2p %d"%len(o2p))
def compute_cell_smushing(self):
"""Within each plate, find a 2d embedding of all cells"""
grouped = self.genes.groupby(self.cell_metadata[self.SAMPLE_MAPPING])
if os.path.exists(self.cell_smushed_cache_file):
smusheds = pd.read_pickle(self.cell_smushed_cache_file)
# if nothing is missing, return the cached version
if not set(grouped.groups) - set(smusheds):
return smusheds
else:
smusheds = {}
for plate_name, genes_subset in grouped:
if plate_name not in smusheds:
cell_smusher = TSNE(metric='cosine', random_state=0)
cell_smushed = pd.DataFrame(
cell_smusher.fit_transform(genes_subset),
index=genes_subset.index)
smusheds[plate_name] = cell_smushed
pd.to_pickle(smusheds, self.cell_smushed_cache_file)
return smusheds
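compute_cell_smushing above is one instance of a load-or-compute caching idiom that recurs throughout these examples. A stripped-down version of the pattern, with hypothetical names, might look like:

import os
import pandas as pd

def load_or_compute(cache_path, compute_fn):
    """Return the cached object if the pickle exists, otherwise compute and cache it."""
    if os.path.exists(cache_path):
        return pd.read_pickle(cache_path)
    result = compute_fn()
    pd.to_pickle(result, cache_path)  # pandas.to_pickle accepts any picklable object
    return result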
def fit_behavioral_data():
"""Fit a model for all subjects. """
df = pd.read_pickle('data.pkl')
subjects = df.index.get_level_values('subject').unique()
data = np.empty((subjects.size, 10))
cues = (0, 1)
for i, subject in enumerate(subjects):
print('Fitting model for subject {}'.format(subject))
df_s = df.loc[subject]
for cue in cues:
ml = ML(df_s[df_s['cue']==cue])
r = ml.ml_estimation()
data[i,2*cue:(2*cue+2)] = r.x
data[i,2*cue+4:2*cue+6] = np.sqrt(np.diag(r.hess_inv.todense()))
data[i,cue+8] = r.fun
model = pd.DataFrame(data, pd.Index(subjects, name='subject'),
['alpha_0', 'beta_0', 'alpha_1', 'beta_1',
'se_alpha_0', 'se_beta_0', 'se_alpha_1', 'se_beta_1',
'NLL_0', 'NLL_1'])
return model
def fit_single_subject(subject=4):
df = pd.read_pickle('data.pkl')
print('Fitting model for subject {}'.format(subject))
df_s = df.loc[subject]
cues = (0, 1, 2)
for cue in cues:
ml = ML(df_s[df_s['cue']==cue])
r = ml.ml_estimation()
H_inv = r.hess_inv.todense()
print('\t cue:{:d}'.format(cue))
print('\t\tr:\n\t\t\t{}\n'.format(r.x))
print('\tInverse of Hessian:\n{}\n'.format(H_inv))
globals().update(locals())
def _load_table(self, filepath):
"""
Load table from file system.
:param str filepath: Path to table in CSV, TSV, XLSX or
Pandas pickle format.
:return: Pandas table
:rtype: pandas.core.frame.DataFrame
"""
_, ext = os.path.splitext(filepath.lower())
if ext == '.tsv':
return pd.read_table(filepath, **self.kwargs)
if ext == '.csv':
return pd.read_csv(filepath, **self.kwargs)
if ext == '.xlsx':
return pd.read_excel(filepath, **self.kwargs)
return pd.read_pickle(filepath, **self.kwargs)
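_load_table dispatches on the file extension and falls back to pd.read_pickle for anything unrecognized. The same idea works as a standalone helper; a sketch, assuming the caller passes keyword arguments appropriate to the chosen reader:

import os
import pandas as pd

def read_any_table(filepath, **kwargs):
    """Choose a pandas reader from the file extension, defaulting to read_pickle."""
    ext = os.path.splitext(filepath.lower())[1]
    readers = {'.tsv': pd.read_table, '.csv': pd.read_csv, '.xlsx': pd.read_excel}
    return readers.get(ext, pd.read_pickle)(filepath, **kwargs)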
test_multi.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def test_legacy_pickle(self):
if PY3:
raise nose.SkipTest("testing for legacy pickles not "
"support on py3")
path = tm.get_data_path('multiindex_v1.pickle')
obj = pd.read_pickle(path)
obj2 = MultiIndex.from_tuples(obj.values)
self.assertTrue(obj.equals(obj2))
res = obj.get_indexer(obj)
exp = np.arange(len(obj))
assert_almost_equal(res, exp)
res = obj.get_indexer(obj2[::-1])
exp = obj.get_indexer(obj[::-1])
exp2 = obj2.get_indexer(obj2[::-1])
assert_almost_equal(res, exp)
assert_almost_equal(exp, exp2)
test_multi.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def test_legacy_v2_unpickle(self):
# 0.7.3 -> 0.8.0 format manage
path = tm.get_data_path('mindex_073.pickle')
obj = pd.read_pickle(path)
obj2 = MultiIndex.from_tuples(obj.values)
self.assertTrue(obj.equals(obj2))
res = obj.get_indexer(obj)
exp = np.arange(len(obj))
assert_almost_equal(res, exp)
res = obj.get_indexer(obj2[::-1])
exp = obj.get_indexer(obj[::-1])
exp2 = obj2.get_indexer(obj2[::-1])
assert_almost_equal(res, exp)
assert_almost_equal(exp, exp2)
test_categorical.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def test_pickle_v0_14_1(self):
# we have the name warning
# 10482
with tm.assert_produces_warning(UserWarning):
cat = pd.Categorical(values=['a', 'b', 'c'],
categories=['a', 'b', 'c', 'd'],
name='foobar', ordered=False)
pickle_path = os.path.join(tm.get_data_path(),
'categorical_0_14_1.pickle')
# This code was executed once on v0.14.1 to generate the pickle:
#
# cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'],
# name='foobar')
# with open(pickle_path, 'wb') as f: pickle.dump(cat, f)
#
self.assert_categorical_equal(cat, pd.read_pickle(pickle_path))
test_categorical.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def test_pickle_v0_15_2(self):
# ordered -> _ordered
# GH 9347
# we have the name warning
# 10482
with tm.assert_produces_warning(UserWarning):
cat = pd.Categorical(values=['a', 'b', 'c'],
categories=['a', 'b', 'c', 'd'],
name='foobar', ordered=False)
pickle_path = os.path.join(tm.get_data_path(),
'categorical_0_15_2.pickle')
# This code was executed once on v0.15.2 to generate the pickle:
#
# cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'],
# name='foobar')
# with open(pickle_path, 'wb') as f: pickle.dump(cat, f)
#
self.assert_categorical_equal(cat, pd.read_pickle(pickle_path))
test_pickle.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def compare(self, vf, version):
# py3 compat when reading py2 pickle
try:
data = pandas.read_pickle(vf)
except (ValueError) as e:
if 'unsupported pickle protocol:' in str(e):
# trying to read a py3 pickle in py2
return
else:
raise
for typ, dv in data.items():
for dt, result in dv.items():
try:
expected = self.data[typ][dt]
except (KeyError):
continue
# use a specific comparator
# if available
comparator = getattr(self, "compare_{typ}_{dt}".format(
typ=typ, dt=dt), self.compare_element)
comparator(result, expected, typ, version)
return data
def thunder():
if os.path.exists('../dataset/thunder.pkl'):
return pd.read_pickle('../dataset/thunder.pkl')
thunder_df = pd.read_csv('../input/thunder.csv',
names=[
'datetime', # observation timestamp
'lat', # latitude (decimal degrees)
'lon', # longitude (decimal degrees)
'type' # discharge type; CG: cloud-to-ground, IC: intra-cloud
])
# parse the datetime strings into pandas Timestamps
thunder_df.datetime = pd.to_datetime(thunder_df.datetime)
# observation_point_df.to_pickle('../dataset/observation_point.pkl')
thunder_df = pd.concat([thunder_df, pd.get_dummies(thunder_df.type)], axis=1)
thunder_df.to_pickle('../dataset/thunder.pkl')  # cache under the same path checked at the top of the function
return thunder_df
def __init__(self, baseDir='../temp/repo'):
'''baseDir: directory where the pickled tables are stored.'''
self.dir = baseDir
self.data = {}
if not os.path.exists(self.dir):
os.makedirs(self.dir)
logging.info('Created data directory: %s' % self.dir)
# load every pickle file found in the directory into memory as a table
for p in os.listdir(self.dir):
if os.path.isfile(
os.path.join(self.dir, p)):
key = re.split(r'\.', p)[0]  # filename without extension (the dot must be escaped)
path = os.path.join(self.dir, p)
t = pd.read_pickle(path)
logging.info('Loaded %s as %s.' % (path, key))
self.data[key] = t
def read_models_from_dir(dir):
models = glob.glob(dir + '/*/')
selected_models = [m for m in models if 'bag' not in m]
print(selected_models)
bagged_oobs = []
bagged_preds = []
for model in selected_models:
pred_file = model + '/' + 'preds.csv'
oob_file = model + '/' + 'oob.pkl'
oob = pd.read_pickle(oob_file)
preds = pd.read_csv(pred_file)
preds['ut_ms'] = pd.to_datetime(preds['ut_ms'], unit='ms')
preds=preds.set_index('ut_ms')
bagged_oobs.append(oob)
bagged_preds.append(preds)
return bagged_oobs, bagged_preds, selected_models
def read_models_from_dir(dir):
model_array = []
models = glob.glob(dir + '/*/')
selected_models = [m for m in models if 'bag' not in m]
print(selected_models)
for model in selected_models:
try:
pred_file = model + '/' + 'preds.csv'
oob_file = model + '/' + 'oob.pkl'
oob = pd.read_pickle(oob_file)
cols = [model + str(i) for i in oob.columns]
print(model, oob.shape)
preds = pd.read_csv(pred_file)
preds['ut_ms'] = pd.to_datetime(preds['ut_ms'], unit='ms')
preds = preds.set_index('ut_ms')
model_array.append((Model(model, oob, preds, RMSE(target.loc[oob.index], oob))))
except Exception:
print("Error!", model)
return model_array
def load():
global user_order, goods, pname2id, model
user_order = pd.read_pickle('../input/mk/user_order.p')
goods = pd.read_pickle('../input/mk/goods.p')
pname2id = {}
for k,v in zip(goods.product_name, goods.product_id):
pname2id[k] = v
model = load_instacart_vec()
print('Activated utils.vec2pids, utils.pnames2ids')
return
def make(T):
"""
T = 0
folder = 'trainT-0'
"""
if T==-1:
folder = 'test'
else:
folder = 'trainT-'+str(T)
label = pd.read_pickle('../feature/{}/label_reordered.p'.format(folder))
df = pd.merge(label[['order_id', 'product_id']],
tbl[['order_id', 'product_id','days_since_last_order_this_item']],
on=['order_id', 'product_id'], how='left')
df.to_pickle('../feature/{}/f303_order-product.p'.format(folder))
#==============================================================================
# main
#==============================================================================
def concat_pred_item(T, dryrun=False):
if T==-1:
name = 'test'
else:
name = 'trainT-'+str(T)
df = utils.load_pred_item(name)
df = pd.merge(df, pd.read_pickle('../feature/{}/f317_user-product.p'.format(name)),
on=['user_id', 'product_id'],how='left')
gc.collect()
#==============================================================================
print('output')
#==============================================================================
if dryrun == True:
return df
else:
utils.to_pickles(df, '../feature/{}/all_apdx'.format(name), 20, inplace=True)
def trainModel(self):
df = pd.read_pickle("./train_features.pkl")
x_df = pd.concat([df.iloc[:,4:6],df.iloc[:,8]],axis=1)
y_df = df.iloc[:,9]
print(x_df)
print(len(x_df))
print(len(y_df))
train_no = int(0.8 * len(df))
#train_no = 100000
print(train_no)
train_df = x_df.iloc[0:train_no,:]
train_labels = y_df.iloc[0:train_no]
test_df = x_df.iloc[train_no:,:]
test_labels = y_df.iloc[train_no:]
self.model = LogisticClassifier(3)
self.model.trainModel(train_df,train_labels)
self.model.validateModel(test_df,test_labels)
def get_answers_matrix(split):
if split == 'train':
data_path = 'data/train_qa'
elif split == 'val':
data_path = 'data/val_qa'
else:
print('Invalid split!')
sys.exit()
df = pd.read_pickle(data_path)
answers = df[['multiple_choice_answer']].values.tolist()
answer_matrix = np.zeros((len(answers),1001))
default_onehot = np.zeros(1001)
default_onehot[1000] = 1.0
for i, answer in enumerate(answers):
answer_matrix[i] = answer_to_onehot_dict.get(answer[0].lower(),default_onehot)
return answer_matrix
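get_answers_matrix relies on a module-level answer_to_onehot_dict mapping the most frequent answers to one-hot rows, with index 1000 reserved as the catch-all for unseen answers. One hypothetical way such a mapping could be built (the top_answers list is an assumption, not part of the original code):

import numpy as np

def build_answer_onehot_dict(top_answers):
    """Map each of the 1000 most frequent answers to a 1001-dim one-hot row."""
    onehot = {}
    for i, ans in enumerate(top_answers[:1000]):
        row = np.zeros(1001)
        row[i] = 1.0
        onehot[ans.lower()] = row
    return onehot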
def get_questions_matrix(split):
if split == 'train':
data_path = 'data/train_qa'
elif split == 'val':
data_path = 'data/val_qa'
else:
print('Invalid split!')
sys.exit()
df = pd.read_pickle(data_path)
questions = df[['question']].values.tolist()
word_idx = ebd.load_idx()
seq_list = []
for question in questions:
words = word_tokenize(question[0])
seq = []
for word in words:
seq.append(word_idx.get(word,0))
seq_list.append(seq)
question_matrix = pad_sequences(seq_list)
return question_matrix
def get_result_by_last_three_weeks_mean():
data = pd.read_pickle(static_params.DATA_PATH + 'user_pay_last_three_weeks.pkl')
result = pd.DataFrame(data['iid'])
date = '2016-11-'
index = 1
for index in range(1,8):
column = date + str(index)
result[column] = data.loc[:,['2016-10-' + str(index + 10),'2016-10-' + str(index + 17),'2016-10-' + str(index + 24)]].mean(1)
data2 = result.copy()
result = pd.merge(data2,result,on='iid')
result.iloc[:,-4] = result.iloc[:,-4]*1.2
result = result.astype(int)
result.to_csv(static_params.DATA_PATH + 'submission.csv',header=None,index=None)
def get_result_by_last_two_weeks_mean():
# predict each day as the mean of the same weekday in the last two weeks
data = pd.read_pickle(static_params.DATA_PATH + 'user_pay_last_two_weeks.pkl')
print(data)
result = pd.DataFrame(data['iid'])
date = '2016-11-'
index = 1
for index in range(1,8):
column = date + str(index)
result[column] = data.loc[:,['2016-10-' + str(index + 17),'2016-10-' + str(index + 24)]].mean(1)
data2 = result.copy()
result = pd.merge(data2,result,on='iid').astype(int)
result.to_csv(static_params.DATA_PATH + 'submission.csv',header=None,index=None)
def user_view_split_by_shop():
if(not os.path.exists(static_params.DATA_USER_VIEW_BY_SHOP_PATH)):
os.mkdir(static_params.DATA_USER_VIEW_BY_SHOP_PATH)
data = pd.read_pickle(static_params.DATA_PATH + 'user_view.pkl')
print(type(data))
data.columns = ['uid','iid','time']
print(data)
data['iid'] = data['iid'].astype(str)
data['time'] = data['time'].apply(get_date)
grouped = data.groupby(['iid'],as_index=False)
for name,group in grouped:
f = open(static_params.DATA_USER_VIEW_BY_SHOP_PATH + str(name) + '.pkl', 'wb')
cPickle.dump(group,f,-1)
f.close()
def get_extra_train():
##############################extra features##################################
train_simhash_features=pd.read_csv('data/extra_feature/train_simhash_features.csv')
train_selftrained_w2v_sim_dist=pd.read_pickle('data/extra_feature/train_selftrained_w2v_sim_dist.pkl')
train_selftrained_glove_sim_dist=pd.read_pickle('data/extra_feature/train_selftrained_glove_sim_dist.pkl')
train_pretrained_w2v_sim_dist=pd.read_pickle('data/extra_feature/train_pretrained_w2v_sim_dist.pkl')
train_distinct_word_stats_selftrained_glove=pd.read_csv('data/extra_feature/train_distinct_word_stats_selftrained_glove.csv')
train_distinct_word_stats_pretrained=pd.read_csv('data/extra_feature/train_distinct_word_stats_pretrained.csv')
train_distinct_word_stats=pd.read_csv('data/extra_feature/train_distinct_word_stats.csv')
X_train=np.hstack([train_simhash_features,
train_selftrained_w2v_sim_dist,
train_selftrained_glove_sim_dist,
train_pretrained_w2v_sim_dist,
train_distinct_word_stats_selftrained_glove,
train_distinct_word_stats_pretrained,
train_distinct_word_stats,])
print(X_train.shape)
return X_train
def get_extra_test():
##############################extra features##################################
test_simhash_features=pd.read_csv('data/extra_feature/test_simhash_features.csv')
test_selftrained_w2v_sim_dist=pd.read_pickle('data/extra_feature/test_selftrained_w2v_sim_dist.pkl')
test_selftrained_glove_sim_dist=pd.read_pickle('data/extra_feature/test_selftrained_glove_sim_dist.pkl')
test_pretrained_w2v_sim_dist=pd.read_pickle('data/extra_feature/test_pretrained_w2v_sim_dist.pkl')
test_distinct_word_stats_selftrained_glove=pd.read_csv('data/extra_feature/test_distinct_word_stats_selftrained_glove.csv')
test_distinct_word_stats_pretrained=pd.read_csv('data/extra_feature/test_distinct_word_stats_pretrained.csv')
test_distinct_word_stats=pd.read_csv('data/extra_feature/test_distinct_word_stats.csv')
X_test=np.hstack([ test_simhash_features,
test_selftrained_w2v_sim_dist,
test_selftrained_glove_sim_dist,
test_pretrained_w2v_sim_dist,
test_distinct_word_stats_selftrained_glove,
test_distinct_word_stats_pretrained,
test_distinct_word_stats,])
print(X_test.shape)
return X_test
def get_feature_importance(feature):
import scipy.stats as sps
import pandas as pd
y_train = pd.read_csv('../data/train.csv')['is_duplicate']
return sps.spearmanr(feature,y_train)[0]
# import pickle
# pickle.dump(X_train,open("data_train.pkl", 'wb'), protocol=2)
#
# data_file=['test_deptree','test_glove_sim_dist','test_pca_glove',
# 'test_pca_pattern','test_w2w','test_pos','test_pca_char']
#
# path='../test/'
# for it in range(6):
# tmp=[]
# flist=[item+str(it) for item in data_file]
# test=np.empty((400000,0))
# if it==5:
# test=np.empty((345796,0))
# for f in flist:
# test=np.hstack([test,pd.read_pickle(path+f+'.pkl')])
# pickle.dump(test,open('data_test{0}.pkl'.format(it),'wb'),protocol=2)
def split_cli():
p = ArgumentParser()
p.add_argument("expanded", default="expanded.pickle", help="Expanded pickle file targets.")
p.add_argument("stripped", default="test.pickle", help="stripped data filename")
p.add_argument("train", default="train.pickle", help="training filename")
p.add_argument("test", default="test.pickle", help="test filename")
p.add_argument("attrfile", default="attrs.txt", help="attrs to care about for NA purposes")
p.add_argument("--na-strategy", default="drop", help="what to do with NA rows (default is drop them)")
p.add_argument("--trainpct", default=70, type=int, help="percentage of data to put into training set")
p.add_argument("--random", action='store_true', help="split train/test sets randomly (default is by time)")
cfg = p.parse_args()
strip_and_process_to_files(expanded_file=pd.read_pickle(cfg.expanded),
stripped_file=cfg.stripped,
attrfile=cfg.attrfile,
na_strategy=cfg.na_strategy)
split_to_files(trainfile=cfg.train,
testfile=cfg.test,
stripped=cfg.stripped,
trainpct=cfg.trainpct,
split_randomly=cfg.random)
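As a usage sketch (the script name and file names are illustrative, not from the original project), the parser above would be invoked along these lines:

python split_dataset.py expanded.pickle stripped.pickle train.pickle test.pickle attrs.txt --trainpct 80 --random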
def load_nf_histplayerinfo(sport, identifiers_to_load):
"""
Load previously saved dataframes of numberfire prediction data.
:param str sport: which sport!
:param list[str] identifiers_to_load: id of players to load
:return dict[str, DataFrame]: dict of player -> prediction data for player
"""
loaded = 0
histplayerinfo_dict = {}
for identifier in identifiers_to_load:
target_file = get_histplayerinfo_filename(sport, identifier)
if os.path.exists(target_file):
histplayerinfo_dict[identifier] = pandas.read_pickle(target_file)
# Attempt to convert the index to time based if possible
if histplayerinfo_dict[identifier] is not None and 'date' in histplayerinfo_dict[identifier].columns:
histplayerinfo_dict[identifier].set_index('date', inplace=True)
loaded += 1
return histplayerinfo_dict
def load_nf_salaryinfo(sport, players):
"""
Load previously saved dataframes of numberfire salary data.
:param str sport: which sport!
:param list[str] players: players to load
:return dict[str, DataFrame]: dict of player -> salary data for player
"""
loaded = 0
player_dict = {}
for player in players:
target_file = get_salary_filename(sport, player)
if os.path.exists(target_file):
player_dict[player] = pandas.read_pickle(target_file)
# Attempt to convert the index to time based if possible
if player_dict[player] is not None and 'date' in player_dict[player].columns:
player_dict[player].set_index('date', inplace=True)
loaded += 1
return player_dict
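load_nf_histplayerinfo and load_nf_salaryinfo repeat the same read-then-reindex step. A small shared helper (a sketch; the function name is hypothetical) would keep the two loaders consistent:

import os
import pandas

def read_pickle_with_date_index(target_file):
    """Load a pickled frame and, when a 'date' column exists, promote it to the index."""
    if not os.path.exists(target_file):
        return None
    frame = pandas.read_pickle(target_file)
    if frame is not None and 'date' in frame.columns:
        frame.set_index('date', inplace=True)
    return frame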