def getStationMetadata():
    '''
    Retrieve metadata on groundwater wells

    @return pandas dataframe with groundwater well information
    '''
    data_file = DataFetcher.getDataLocation('groundwater')
    if data_file is None:
        print('Dataset not available')
        return None

    store = pd.HDFStore(data_file, 'r')
    meta_data = store['meta_data']
    store.close()

    return meta_data
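A minimal usage sketch (not part of the snippet above); it assumes the groundwater store has already been cached so that getDataLocation() returns a valid path:

# Hypothetical usage; returns None (and prints a message) if the groundwater
# dataset has not been fetched yet.
meta = getStationMetadata()
if meta is not None:
    print(meta.head())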
def getAntennaLogs():
    '''
    Retrieve information about antenna changes

    @return dictionary of antenna changes
    '''
    store_location = data_util.getDataLocation('ngl_gps')
    store = pd.HDFStore(store_location, 'r')
    logs_df = store['ngl_steps']
    store.close()

    metadata = DataFetcher.getStationMetadata()

    logs_dict = OrderedDict()
    for station in metadata.index:
        offset_dates = logs_df[logs_df['Station'] == station].index.unique()
        offset_dates = pd.Series(offset_dates)
        logs_dict[station] = offset_dates

    return logs_dict
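A small consumption sketch (hypothetical, not from the source) showing the shape of the returned mapping, station name to a pandas Series of offset dates:

# Hypothetical usage; assumes the 'ngl_gps' store has been cached locally.
logs = getAntennaLogs()
for station, dates in list(logs.items())[:3]:
    print(station, len(dates), 'antenna changes')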
def __init__(self, db_h5_file,
             future_time=20, lookback=100, read_first_k_table=-1, normalize=True, two_class=False):
    super(DBGeneticReader, self).__init__()
    self._db = pd.HDFStore(db_h5_file)
    self._future_time = future_time
    self._lookback = lookback
    self._db_len = 0
    self._two_class = two_class
    self._tables = []

    for k in self._db:
        self._db_len += len(self._db[k]) - future_time - lookback
        t = self._db[k].iloc[:, 4:].astype('float32')
        t['AveragePrice'] = (t['AskPrice1'] + t['BidPrice1']) / 2
        if normalize:
            t = (t - t.mean()) / (t.std() + 1e-10)
        self._tables.append(t)
        if read_first_k_table != -1 and len(self._tables) == read_first_k_table:
            break
def _read_hdf_from_buffer(self, buffer):
    with pandas.HDFStore(
            "data.h5",
            mode="r",
            driver="H5FD_CORE",
            driver_core_backing_store=0,
            driver_core_image=buffer.read()) as store:

        if len(store.keys()) > 1:
            raise Exception('Ambiguous matrix store. More than one dataframe in the hdf file.')

        try:
            return store["matrix"]
        except KeyError:
            print("The hdf file should contain one and only one key, 'matrix'.")
            return store[store.keys()[0]]
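A hypothetical way to obtain such a buffer (names and paths are assumptions, not from the source): read a serialized HDF5 matrix from disk into an in-memory file object and hand it to the method.

import io

# Hypothetical setup: 'matrix.h5' holds a single dataframe under the 'matrix' key,
# and matrix_store is an instance of the class that defines _read_hdf_from_buffer().
with open("matrix.h5", "rb") as f:
    buffer = io.BytesIO(f.read())
df = matrix_store._read_hdf_from_buffer(buffer)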
def fake_metta(matrix_dict, metadata):
    """Stores matrix and metadata in a metta-data-like form

    Args:
        matrix_dict (dict) of form { columns: values }.
            Expects an entity_id to be present which it will use as the index
        metadata (dict). Any metadata that should be set

    Yields:
        tuple of filenames for matrix and metadata
    """
    matrix = pandas.DataFrame.from_dict(matrix_dict).set_index('entity_id')
    with tempfile.NamedTemporaryFile() as matrix_file:
        with tempfile.NamedTemporaryFile('w') as metadata_file:
            hdf = pandas.HDFStore(matrix_file.name)
            hdf.put('title', matrix, data_columns=True)
            matrix_file.seek(0)
            yaml.dump(metadata, metadata_file)
            metadata_file.seek(0)
            yield (matrix_file.name, metadata_file.name)
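Since the function yields, it is presumably consumed as a generator or wrapped as a context manager in its test suite; a usage sketch under that assumption (all inputs are hypothetical):

from contextlib import contextmanager

# Assumption: fake_metta is consumed as a context manager in the tests.
with contextmanager(fake_metta)({'entity_id': [1, 2], 'feature': [0.5, 0.7]},
                                {'end_time': '2016-01-01'}) as (matrix_path, metadata_path):
    print(matrix_path, metadata_path)  # the temporary files exist only inside this block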
# kagglegym.py -- from the Two-Sigma-Financial-Modeling-Challenge project (author: xiaofeiwen)
def __init__(self):
    with pd.HDFStore("../input/train.h5", "r") as hfdata:
        self.timestamp = 0
        fullset = hfdata.get("train")
        self.unique_timestamp = fullset["timestamp"].unique()
        # Get a list of unique timestamps:
        # use the first half for training and
        # the second half for the test set
        n = len(self.unique_timestamp)
        i = int(n / 2)
        timesplit = self.unique_timestamp[i]
        self.n = n
        self.unique_idx = i
        self.train = fullset[fullset.timestamp < timesplit]
        self.test = fullset[fullset.timestamp >= timesplit]

        # Needed to compute final score
        self.full = self.test.loc[:, ['timestamp', 'y']]
        self.full['y_hat'] = 0.0
        self.temp_test_y = None
def hdfone_filenames(folder: str, path_to: str) -> List[str]:
    filenames = []
    try:
        assert isinstance(folder, str), "folder isn't string: %s" % folder
        assert isinstance(path_to, str), "path_to isn't string: %s" % path_to

        if settings.DATA_TYPE == "hdfone":
            f = join(settings.DATA_PATH, "hdfone.hdfone")
            if isfile(f):
                with HDFStore(f) as hdf:
                    filenames = [f for f in hdf.keys() if folder in f]
                    hdf.close()  # redundant: the context manager closes the store anyway
        else:
            filenames = multi_filenames(path_to_history=path_to)
    except Exception as err:
        print(colored.red("hdfone_filenames: {}".format(err)))

    return filenames
def write(self, frames):
    """
    Write the frames to the target HDF5 file, using the format used by
    ``pd.Panel.to_hdf``.

    Parameters
    ----------
    frames : iter[(int, DataFrame)] or dict[int -> DataFrame]
        An iterable or other mapping of sid to the corresponding OHLCV
        pricing data.
    """
    with HDFStore(self._path, 'w',
                  complevel=self._complevel,
                  complib=self._complib) as store:
        panel = pd.Panel.from_dict(dict(frames))
        panel.to_hdf(store, 'updates')

    with tables.open_file(self._path, mode='r+') as h5file:
        h5file.set_node_attr('/', 'version', 0)
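Note that pd.Panel was removed in pandas 1.0, so the snippet above only runs on older pandas. A rough modern sketch of the same idea (single 'updates' key plus a root-level 'version' attribute), using a concatenated MultiIndex frame instead of a Panel; everything beyond those two details is an assumption:

import pandas as pd
import tables

def write_updates(path, frames, complevel=5, complib='zlib'):
    # Stack the per-sid OHLCV frames into one frame keyed by sid
    # (assumption: this replaces the Panel layout used above).
    combined = pd.concat(dict(frames))
    with pd.HDFStore(path, 'w', complevel=complevel, complib=complib) as store:
        store.put('updates', combined)
    # Preserve the root-level 'version' attribute written by the original code.
    with tables.open_file(path, mode='r+') as h5file:
        h5file.set_node_attr('/', 'version', 0)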
def add_data(self, packet: DataPacket):
    """
    Import a DataFrame into the project
    :param packet: DataPacket custom class containing file path, dataframe, data type and flight association
    :return: Void
    """
    self.log.debug("Ingesting data and exporting to hdf5 store")
    file_uid = 'f' + uuid.uuid4().hex[1:]  # Fixes NaturalNameWarning by ensuring first char is a letter ('f').
    with HDFStore(str(self.hdf_path)) as store:
        # Separate data into groups by data type (GPS & Gravity Data).
        # format: 'table' pytables format enables searching/appending, 'fixed' is more performant.
        store.put('{}/{}'.format(packet.data_type, file_uid), packet.data, format='fixed', data_columns=True)

    # Store a reference to the original file path
    self.data_map[file_uid] = packet.path
    try:
        flight = self.flights[packet.flight.uid]
        if packet.data_type == 'gravity':
            flight.gravity = file_uid
        elif packet.data_type == 'gps':
            flight.gps = file_uid
    except KeyError:
        return False
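A hypothetical read-back of an ingested frame, assuming the same '<data_type>/<file_uid>' key layout used by store.put() above ('project' and 'file_uid' are stand-in names):

# Hypothetical: retrieve a previously ingested gravity frame by its file_uid.
with HDFStore(str(project.hdf_path), mode='r') as store:
    gravity_df = store.get('gravity/{}'.format(file_uid))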
def save_cache_file(data, cache_file, **kwargs):
    """Save minitree dataframe + cut history to a cache file

    Any kwargs will be passed to pandas HDFStore. Defaults are:
        complib='blosc'
        complevel=9
    """
    kwargs.setdefault('complib', 'blosc')
    kwargs.setdefault('complevel', 9)

    dirname = os.path.dirname(cache_file)
    if dirname and not os.path.exists(dirname):
        os.makedirs(dirname)

    store = pd.HDFStore(cache_file, **kwargs)
    store.put('data', data)

    # Store the cuts history for the data
    store.get_storer('data').attrs.cut_history = cuts._get_history(data)
    store.close()
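A minimal counterpart sketch for reading the cache back (assumed, not from the source), relying only on the 'data' key and the cut_history attribute written above:

def load_cache_file(cache_file):
    # Hypothetical loader mirroring save_cache_file().
    with pd.HDFStore(cache_file, mode='r') as store:
        data = store.get('data')
        cut_history = store.get_storer('data').attrs.cut_history
    return data, cut_history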
def __init__(self):
    with pd.HDFStore("../input/train.h5", "r") as hfdata:
        self.timestamp = 0
        fullset = hfdata.get("train")
        self.unique_timestamp = fullset["timestamp"].unique()
        # Get a list of unique timestamps:
        # use the first half for training and
        # the second half for the test set
        n = len(self.unique_timestamp)
        i = int(n / 2)
        timesplit = self.unique_timestamp[i]
        self.n = n
        self.unique_idx = i
        self.train = fullset[fullset.timestamp < timesplit]
        self.test = fullset[fullset.timestamp >= timesplit]

        self.y_test_full = self.test['y']  # Just in case the full labels are needed later
        self.y_pred_full = []
        self.temp_test_y = None

        self.ID_COL_NAME = 'id'
        self.SAMPLE_COL_NAME = 'sample'
        self.TARGET_COL_NAME = 'y'
        self.TIME_COL_NAME = 'timestamp'
def _init_frame_node_parm_with_data(self):
    """
    Initialize the parameters that need to be calculated from the input data
    :return: None
    """
    try:
        store = pd.HDFStore(self.input_paths[0])
        chunk = store.select('table1',
                             start=0,
                             stop=100)
        for col_name in self.encode_col:
            if self.encode_len.get(col_name) is None:
                if chunk[col_name].dtype in ['int', 'float']:
                    self.encode_len[col_name] = 1
                    self.input_size = self.input_size + 1
                else:
                    self.encode_len[col_name] = self.word_vector_size
                    self.input_size = self.input_size + self.word_vector_size
                    self.encode_onehot[col_name] = OneHotEncoder(self.word_vector_size)
                self.encode_dtype[col_name] = str(chunk[col_name].dtype)
    except Exception as e:
        raise Exception("error on wcnn feed parm prepare : {0}".format(e))
def _frame_parser(self, file_path, index):
    """
    Parse frame (tabular) data
    :return: flattened input vectors
    """
    try:
        store = pd.HDFStore(file_path)
        chunk = store.select('table1',
                             start=index.start,
                             stop=index.stop)
        input_vector = []
        count = index.stop - index.start
        for col_name in self.encode_col:
            if chunk[col_name].dtype == 'O':
                input_vector.append(list(map(lambda x: self.encode_onehot[col_name].get_vector(x),
                                             chunk[col_name][0:count].tolist())))
            else:
                input_vector.append(np.array(list(map(lambda x: [self._filter_nan(x)],
                                                      chunk[col_name][0:count].tolist()))))
        return self._flat_data(input_vector, len(chunk[col_name][0:count].tolist()))
    except Exception as e:
        raise Exception(e)
    finally:
        store.close()
def _nlp_parser(self, file_path, index):
    """
    Parse nlp (text) data
    :return: word-embedded encode sequences
    """
    try:
        store = pd.HDFStore(file_path)
        chunk = store.select('table1',
                             start=index.start,
                             stop=index.stop)
        count = index.stop - index.start
        if self.encode_col in chunk:
            encode = self.encode_pad(self._preprocess(chunk[self.encode_col].values)[0:count],
                                     max_len=self.encode_len)
            return self._word_embed_data(self.embed_type, encode)
        else:
            warnings.warn("requested encode column does not exist in the chunk")
            return [['#'] * self.encode_len]
    except Exception as e:
        raise Exception(e)
    finally:
        store.close()
def _convert_data_format(self, file_path, index):
    """
    Read the requested rows and concatenate the preprocessed columns into a single sequence
    :param file_path: path of the HDF5 store to read
    :param index: slice-like object with start/stop row positions
    :return: list containing one flattened sequence
    """
    try:
        return_data = []
        store = pd.HDFStore(file_path)
        chunk = store.select('table1',
                             start=index.start,
                             stop=index.stop)
        for column in self.column_list:
            for line in self._preprocess(chunk[column].values)[index.start:index.stop]:
                return_data = return_data + line
        return [return_data]
    except Exception as e:
        raise Exception(e)
    finally:
        store.close()
def test_grbm_reload():
    vis_layer = layers.BernoulliLayer(num_vis)
    hid_layer = layers.GaussianLayer(num_hid)

    # create some extrinsics
    grbm = model.Model([vis_layer, hid_layer])

    with tempfile.NamedTemporaryFile() as file:
        # save the model
        store = pandas.HDFStore(file.name, mode='w')
        grbm.save(store)
        store.close()

        # reload
        store = pandas.HDFStore(file.name, mode='r')
        grbm_reload = model.Model.from_saved(store)
        store.close()

    # check the two models are consistent
    vis_data = vis_layer.random((num_samples, num_vis))
    data_state = model_utils.State.from_visible(vis_data, grbm)
    dropout_scale = model_utils.State.dropout_rescale(grbm)
    vis_orig = grbm.deterministic_iteration(1, data_state, dropout_scale).units[0]
    vis_reload = grbm_reload.deterministic_iteration(1, data_state, dropout_scale).units[0]

    assert be.allclose(vis_orig, vis_reload)
def save(self, store: pandas.HDFStore) -> None:
    """
    Save a model to an open HDFStore.

    Notes:
        Performs an IO operation.

    Args:
        store (pandas.HDFStore)

    Returns:
        None
    """
    # save the config as an attribute
    config = self.get_config()
    store.put('model', pandas.DataFrame())
    store.get_storer('model').attrs.config = config

    # save the parameters
    for i in range(self.num_weights):
        key = os.path.join('weights', 'weights' + str(i))
        self.weights[i].save_params(store, key)
    for i in range(self.num_layers):
        key = os.path.join('layers', 'layers' + str(i))
        self.layers[i].save_params(store, key)
def save_params(self, store, key):
    """
    Save the parameters to a HDFStore.

    Notes:
        Performs an IO operation.

    Args:
        store (pandas.HDFStore): the writeable stream for the params.
        key (str): the path for the layer params.

    Returns:
        None
    """
    for i, ip in enumerate(self.params):
        df_params = pandas.DataFrame(be.to_numpy_array(ip))
        store.put(os.path.join(key, 'parameters', 'key' + str(i)), df_params)
def load_params(self, store, key):
    """
    Load the parameters from an HDFStore.

    Notes:
        Performs an IO operation.

    Args:
        store (pandas.HDFStore): the readable stream for the params.
        key (str): the path for the layer params.

    Returns:
        None
    """
    params = []
    for i, ip in enumerate(self.params):
        # DataFrame.as_matrix() was removed in pandas 1.0; .to_numpy() is the modern equivalent.
        params.append(be.float_tensor(
            store.get(os.path.join(key, 'parameters', 'key' + str(i))).as_matrix()
        ).squeeze())  # collapse trivial dimensions to a vector
    self.params = self.params.__class__(*params)
def get_trajs_mat(self, cols, traj):
    if traj == 'avg_probe_pp':
        with pd.HDFStore(self.pp_traj_df_path, mode='r') as store:
            df_traj = store.select('pp_traj_df', columns=cols)
            trajs_mat = df_traj.values.transpose()
    elif traj == 'avg_probe_ba':
        with pd.HDFStore(self.ba_traj_df_path, mode='r') as store:
            df_traj = store.select('ba_traj_df', columns=cols)
            trajs_mat = df_traj.values.transpose()
    elif 'cat_task' in traj:
        with pd.HDFStore(self.cat_task_traj_df_path, mode='r') as store:
            columns = [traj.replace('cat_task_', '') + '_fold{}'.format(i) for i in cols]
            df_traj = store.select('cat_task_traj_df', columns=columns)
            trajs_mat = df_traj.values.transpose()
    elif 'syn_task' in traj:
        with pd.HDFStore(self.syn_task_traj_df_path, mode='r') as store:
            columns = [traj.replace('syn_task_', '') + '_fold{}'.format(i) for i in cols]
            df_traj = store.select('syn_task_traj_df', columns=columns)
            trajs_mat = df_traj.values.transpose()
    else:
        raise AttributeError('rnnlab: Invalid argument passed to "traj".')
    return trajs_mat
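Worth noting (the writer side is not shown in the source): HDFStore.select() with a columns= argument only works on frames written in 'table' format, so the trajectory frames are presumably stored along these lines:

# Hypothetical writer for one of the trajectory frames; the path and frame are
# stand-ins, but format='table' is what makes the column selection above possible.
with pd.HDFStore('pp_traj.h5', mode='w') as store:
    store.put('pp_traj_df', pp_traj_df, format='table')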
def __iter__(self, gen_type='train', batch_size=None, shuffle_block=False, random_sample=False,
             split_fields=False, on_disk=True, squeeze_output=False, **kwargs):
    gen_type = gen_type.lower()

    if on_disk:
        print('on disk...')
        for hdf_X, hdf_y in self._files_iter_(gen_type=gen_type, shuffle_block=shuffle_block):
            # num_of_lines = pd.HDFStore(hdf_y, mode='r').get_storer('fixed').shape[0]
            X_all = pd.read_hdf(hdf_X, mode='r').as_matrix()
            y_all = pd.read_hdf(hdf_y, mode='r').as_matrix()
            gen = self.generator(X_all, y_all, batch_size, shuffle=random_sample)
            for X, y in gen:
                if split_fields:
                    X = np.split(X, self.max_length, axis=1)
                    for i in range(self.max_length):
                        X[i] -= self.feat_min[i]
                if squeeze_output:
                    y = y.squeeze()
                yield X, y
    else:
        print('not implemented')
def read_offsets(offsets_file):
    offsets = np.zeros(
        shape=(8, 2, 4096, 40),
        dtype='f4')

    def name_to_channel_gain_id(name):
        _, channel, gain = name.split('_')
        channel = int(channel)
        gain_id = {'high': 0, 'low': 1}[gain]
        return channel, gain_id

    with pd.HDFStore(offsets_file) as st:
        for name in st.keys():
            channel, gain_id = name_to_channel_gain_id(name)
            df = st[name]
            df.sort_values(["cell", "sample"], inplace=True)
            offsets[channel, gain_id] = df["median"].values.reshape(-1, 40)

    return offsets
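For context (inferred from the reader above, not shown in the source), the store is expected to hold one frame per key of the form '<prefix>_<channel>_<gain>' with 'cell', 'sample' and 'median' columns; a hypothetical writer would look like:

# Hypothetical writer matching the layout read_offsets() expects for channel 0,
# high gain; offsets_df is a stand-in frame with 'cell', 'sample' and 'median'
# columns covering 4096 cells x 40 samples.
with pd.HDFStore('offsets.h5', mode='w') as st:
    st.put('offsets_0_high', offsets_df)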
def cacheData(self, data_specification):
    '''
    Cache Kepler data locally

    @param data_specification: List of kepler IDs
    '''
    kid_list = data_specification

    data_location = DataFetcher.getDataLocation('kepler')
    if data_location is None:
        data_location = os.path.join(os.path.expanduser('~'), '.skdaccess', 'kepler')
        os.makedirs(data_location, exist_ok=True)
        data_location = os.path.join(data_location, 'kepler_data.h5')
        DataFetcher.setDataLocation('kepler', data_location)

    store = pd.HDFStore(data_location)

    missing_kid_list = []
    for kid in kid_list:
        if 'kid_' + kid not in store:
            missing_kid_list.append(kid)

    if len(missing_kid_list) > 0:
        print("Downloading data for " + str(len(missing_kid_list)) + " star(s)")
        missing_kid_data = self.downloadKeplerData(missing_kid_list)
        for kid, data in missing_kid_data.items():
            store.put('kid_' + kid, data)

    store.close()
def output(self):
    '''
    Output kepler data wrapper

    @return DataWrapper
    '''
    kid_list = self.ap_paramList[0]()
    kid_list = [str(kid).zfill(9) for kid in kid_list]

    self.cacheData(kid_list)

    data_location = DataFetcher.getDataLocation('kepler')

    kid_data = dict()
    store = pd.HDFStore(data_location)
    for kid in kid_list:
        kid_data[kid] = store['kid_' + kid]
        # If downloaded using old skdaccess version, switch index
        if kid_data[kid].index.name == 'TIME':
            kid_data[kid]['TIME'] = kid_data[kid].index
            kid_data[kid].set_index('CADENCENO', inplace=True)
    store.close()

    kid_data = OrderedDict(sorted(kid_data.items(), key=lambda t: t[0]))

    # If a list of quarters is specified, only select data in those quarters
    if self.quarter_list is not None:
        for kid in kid_list:
            kid_data[kid] = kid_data[kid][kid_data[kid]['QUARTER'].isin(self.quarter_list)]

    return TableWrapper(kid_data, default_columns=['PDCSAP_FLUX'], default_error_columns=['PDCSAP_FLUX_ERR'])