def write_legacy_pickles(output_dir):
    # make sure we are pre-0.13 compatible (needed on py3)
    try:
        from pandas.compat import zip, cPickle as pickle  # noqa
    except ImportError:
        import pickle
version = pandas.__version__
print("This script generates a storage file for the current arch, system, "
"and python version")
print(" pandas version: {0}".format(version))
print(" output dir : {0}".format(output_dir))
print(" storage format: pickle")
pth = '{0}.pickle'.format(platform_name())
    with open(os.path.join(output_dir, pth), 'wb') as fh:
        pickle.dump(create_pickle_data(), fh, pickle.HIGHEST_PROTOCOL)
print("created pickle file: %s" % pth)
Python __version__ usage examples (source code)
Source: generate_legacy_storage_files.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
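For round-tripping in tests, a matching reader is straightforward; this is a minimal sketch (the name read_legacy_pickle is hypothetical, and it assumes the platform_name() helper shown later in this file):

def read_legacy_pickle(output_dir):
    # load the platform-named pickle written by write_legacy_pickles
    pth = '{0}.pickle'.format(platform_name())
    with open(os.path.join(output_dir, pth), 'rb') as fh:
        return pickle.load(fh)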
Source: test_pytables.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def test_nan_selection_bug_4858(self):
# GH 4858; nan selection bug, only works for pytables >= 3.1
if LooseVersion(tables.__version__) < '3.1.0':
raise nose.SkipTest('tables version does not support fix for nan '
'selection bug: GH 4858')
with ensure_clean_store(self.path) as store:
df = DataFrame(dict(cols=range(6), values=range(6)),
dtype='float64')
df['cols'] = (df['cols'] + 10).apply(str)
df.iloc[0] = np.nan
            expected = DataFrame(dict(cols=['13.0', '14.0', '15.0'],
                                      values=[3., 4., 5.]),
                                 index=[3, 4, 5])
# write w/o the index on that particular column
store.append('df', df, data_columns=True, index=['cols'])
result = store.select('df', where='values>2.0')
assert_frame_equal(result, expected)
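The same round trip can be reproduced outside the test harness with plain pandas; a minimal sketch, assuming a recent pandas with PyTables installed and a scratch path 'store.h5':

import numpy as np
import pandas as pd

df = pd.DataFrame({'cols': range(6), 'values': range(6)}, dtype='float64')
df['cols'] = (df['cols'] + 10).apply(str)
df.iloc[0] = np.nan  # the NaN row that triggered GH 4858

with pd.HDFStore('store.h5', mode='w') as store:
    store.append('df', df, data_columns=True, index=['cols'])
    result = store.select('df', where='values>2.0')  # keeps rows 3, 4, 5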
Source: test_pytables.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def test_legacy_table_write(self):
    raise nose.SkipTest("cannot write legacy tables")
    # NOTE: the code below is unreachable by design; it records how the
    # legacy fixture files were originally written.
store = HDFStore(tm.get_data_path(
'legacy_hdf/legacy_table_%s.h5' % pandas.__version__), 'a')
df = tm.makeDataFrame()
wp = tm.makePanel()
index = MultiIndex(levels=[['foo', 'bar', 'baz', 'qux'],
['one', 'two', 'three']],
labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3],
[0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
names=['foo', 'bar'])
df = DataFrame(np.random.randn(10, 3), index=index,
columns=['A', 'B', 'C'])
store.append('mi', df)
df = DataFrame(dict(A='foo', B='bar'), index=lrange(10))
store.append('df', df, data_columns=['B'], min_itemsize={'A': 200})
store.append('wp', wp)
store.close()
def get_pandas_status():
try:
import pandas as pd
return _check_version(pd.__version__, pandas_min_version)
except ImportError:
traceback.print_exc()
return default_status
def get_sklearn_status():
try:
import sklearn as sk
return _check_version(sk.__version__, sklearn_min_version)
except ImportError:
traceback.print_exc()
return default_status
def get_numpy_status():
try:
import numpy as np
return _check_version(np.__version__, numpy_min_version)
except ImportError:
traceback.print_exc()
return default_status
def get_scipy_status():
try:
import scipy as sc
return _check_version(sc.__version__, scipy_min_version)
except ImportError:
traceback.print_exc()
return default_status
def get_h2o_status():
try:
import h2o
return _check_version(h2o.__version__, h2o_min_version)
except ImportError:
traceback.print_exc()
return default_status
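The _check_version helper and the *_min_version / default_status names are defined elsewhere in this project; a minimal sketch of plausible definitions (the returned dict shape is an assumption):

from distutils.version import LooseVersion

default_status = {'installed': False, 'up_to_date': False, 'version': None}

def _check_version(installed_version, min_version):
    # report whether the package is present and meets the minimum version
    return {'installed': True,
            'up_to_date': LooseVersion(installed_version) >= LooseVersion(min_version),
            'version': installed_version}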
def sortDataFrame(df, column, ascending, inplace):
if pd.__version__ in ['0.13.0', '0.14.1']:
new_df = df.sort([column], ascending = [ascending], inplace = inplace)
else:
new_df = df.sort_values([column], ascending = [ascending], inplace = inplace)
return new_df
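Illustrative call (the df and the 'price' column are hypothetical):

sorted_df = sortDataFrame(df, 'price', ascending=True, inplace=False)

Note that with inplace=True both branches return None, since pandas mutates df in place, so callers should continue using the original frame.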
def print_my_path():
print('cwd: {}'.format(getcwd()))
print('__file__:{}'.format(__file__))
print('abspath: {}'.format(path.abspath(__file__)))
print('tensorflow: {}'.format(tf.__version__))
print('pandas: {}'.format(pd.__version__))
def add_date_features_df(col_data, date_col):
    # Pandas nicely tries to prevent you from doing risky things, like setting
    # values on a copy of a DataFrame rather than on the real one. It is a bit
    # overzealous in this case, so we side-step those warnings by setting
    # is_copy to False.
result = {}
col_data = pd.to_datetime(col_data)
if pandas_version < '0.20.0':
result[date_col + '_day_of_week'] = col_data.apply(lambda x: x.weekday()).astype(int, raise_on_error=False)
else:
result[date_col + '_day_of_week'] = col_data.apply(lambda x: x.weekday()).astype(int, errors='ignore')
try:
if pandas_version < '0.20.0':
result[date_col + '_hour'] = col_data.apply(lambda x: x.hour).astype(int, raise_on_error=False)
else:
result[date_col + '_hour'] = col_data.apply(lambda x: x.hour).astype(int, errors='ignore')
result[date_col + '_minutes_into_day'] = col_data.apply(lambda x: x.hour * 60 + x.minute)
result[date_col + '_hour'] = result[date_col + '_hour'].fillna(0)
result[date_col + '_minutes_into_day'] = result[date_col + '_minutes_into_day'].fillna(0)
except AttributeError:
pass
    result[date_col + '_is_weekend'] = col_data.apply(lambda x: x.weekday() in (5, 6))
result[date_col + '_day_part'] = result[date_col + '_minutes_into_day'].apply(minutes_into_day_parts)
result[date_col + '_day_of_week'] = result[date_col + '_day_of_week'].fillna(0)
result[date_col + '_is_weekend'] = result[date_col + '_is_weekend'].fillna(0)
result[date_col + '_day_part'] = result[date_col + '_day_part'].fillna(0)
return result
# Same logic as above, except implemented for a single dictionary, which is much faster at prediction time when getting just a single prediction
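An illustrative call of add_date_features_df above (the column name and timestamps are made up); the keys of the returned dict become the engineered feature names:

col = pd.Series(['2017-06-03 14:30:00', '2017-06-05 08:05:00'])
features = add_date_features_df(col, 'signup_time')
# e.g. features['signup_time_day_of_week'], features['signup_time_is_weekend']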
Source: generate_legacy_storage_files.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def platform_name():
return '_'.join([str(pandas.__version__), str(pl.machine()),
str(pl.system().lower()), str(pl.python_version())])
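On a hypothetical 64-bit Linux machine running pandas 0.18.1 under Python 3.5.2, this produces a name like:

# '0.18.1_x86_64_linux_3.5.2'  (illustrative values only)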
Source: generate_legacy_storage_files.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def write_legacy_msgpack(output_dir, compress):
version = pandas.__version__
print("This script generates a storage file for the current arch, "
"system, and python version")
print(" pandas version: {0}".format(version))
print(" output dir : {0}".format(output_dir))
print(" storage format: msgpack")
pth = '{0}.msgpack'.format(platform_name())
to_msgpack(os.path.join(output_dir, pth), create_msgpack_data(),
compress=compress)
print("created msgpack file: %s" % pth)
def sanity_check():
"""
Report the version number of the core packages we use
:return: Nothing
"""
import matplotlib
print('matplotlib: {}'.format(matplotlib.__version__))
print('numpy: {}'.format(np.__version__))
print('pandas: {}'.format(pd.__version__))
###############################################################################
def check_version(library, min_version):
"""Check minimum library version required
Parameters
----------
library : str
The library name to import. Must have a ``__version__`` property.
min_version : str
The minimum version string. Anything that matches
``'(\\d+ | [a-z]+ | \\.)'``
Returns
-------
ok : bool
True if the library exists with at least the specified version.
"""
ok = True
try:
library = __import__(library)
except ImportError:
ok = False
else:
this_version = LooseVersion(library.__version__)
if this_version < min_version:
ok = False
return ok
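Typical guard-clause usage (the numpy threshold here is just an example):

if not check_version('numpy', '1.10'):
    raise RuntimeError('numpy >= 1.10 is required')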
def serialize(cls, formatted_data, fh):
# compat: if pandas is old, to_pickle does not accept file handles
if LooseVersion(pd.__version__) <= LooseVersion('0.20.3'):
fh.close()
fh = fh.name
return pd.to_pickle(formatted_data, fh)
def hourly_resample(df, bse=0, minutes=60):
"""
Args:
df:
pandas dataframe containing time series needing resampling
bse (int):
base time to set; optional; default is zero (on the hour);
minutes (int):
sampling recurrence interval in minutes; optional; default is 60 (hourly samples)
Returns:
A Pandas DataFrame that has been resampled to every hour, at the minute defined by the base (bse)
Description:
see http://pandas.pydata.org/pandas-docs/dev/generated/pandas.DataFrame.resample.html for more info
    This function uses pandas' powerful time-series manipulation to upsample to every minute, then downsample to every hour,
    on the hour.
    This function will need adjustment if you do not want it to return hourly samples, or if you are sampling more frequently than
once per minute.
see http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases
"""
if int(str(pd.__version__).split('.')[0]) == 0 and int(str(pd.__version__).split('.')[1]) < 18: # pandas versioning
df = df.resample('1Min')
else:
        # you can make this smaller to accommodate a higher sampling frequency
df = df.resample('1Min').first()
# http://pandas.pydata.org/pandas-docs/dev/generated/pandas.Series.interpolate.html
df = df.interpolate(method='time', limit=90)
if int(str(pd.__version__).split('.')[0]) == 0 and int(str(pd.__version__).split('.')[1]) < 18: # pandas versioning
df = df.resample(str(minutes) + 'Min', closed='left', label='left', base=bse)
else:
# modify '60Min' to change the resulting frequency
df = df.resample(str(minutes) + 'Min', closed='left', label='left', base=bse).first()
return df
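Illustrative usage, assuming wl is a DataFrame indexed by irregular timestamps: resample to hourly values anchored on the half hour.

hourly = hourly_resample(wl, bse=30, minutes=60)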
def _get_versions(requirements=True):
if requirements:
import matplotlib as mpl
import xarray as xr
import pandas as pd
import numpy as np
return {'version': __version__,
'requirements': {'matplotlib': mpl.__version__,
'xarray': xr.__version__,
'pandas': pd.__version__,
'numpy': np.__version__,
'python': ' '.join(sys.version.splitlines())}}
else:
return {'version': __version__}
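For illustration, the mapping returned with requirements=True looks like this (all version values hypothetical):

# {'version': '0.4.0',
#  'requirements': {'matplotlib': '2.0.2', 'xarray': '0.9.6',
#                   'pandas': '0.20.3', 'numpy': '1.13.1',
#                   'python': '3.6.2 ...'}}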
def fit(self, X_df, y=None):
print('Running basic data cleaning')
self.vals_to_drop = set(['ignore', 'output', 'regressor', 'classifier'])
# See if we should fit TfidfVectorizer or not
for key in X_df.columns:
if X_df[key].dtype == 'object' and self.column_descriptions.get(key, False) not in ['categorical', 'ignore', 'nlp']:
# First, make sure that the values in this column are not just ints, or float('nan')
vals = X_df[key].sample(n=10)
is_categorical = False
for val in vals:
try:
if val is not None:
float(val)
except Exception as e:
print(e)
is_categorical = True
if is_categorical:
print('\n')
print('Encountered a column that is not marked as categorical, but is an "object" pandas type, which typically indicates a categorical column.')
print('The name of this columns is: "{}"'.format(key))
print('Some example features in this column are: {}'.format(list(X_df[key].sample(n=5))))
print('If this is a categorical column, please mark it as `{}: "categorical"` as part of your column_descriptions'.format(key))
print('If this is not a categorical column, please consider converting its dtype before passing data into auto_ml')
print('\n')
warnings.warn('Consider marking the "{}" column as categorical'.format(key))
if self.transformed_column_descriptions.get(key) is None:
self.transformed_column_descriptions[key] = 'continuous'
if key in self.text_columns:
X_df[key].fillna('nan', inplace=True)
if pandas_version < '0.20.0':
text_col = X_df[key].astype(str, raise_on_error=False)
else:
text_col = X_df[key].astype(str, errors='ignore')
self.text_columns[key].fit(text_col)
col_names = self.text_columns[key].get_feature_names()
# Make weird characters play nice, or just ignore them :)
for idx, word in enumerate(col_names):
try:
col_names[idx] = str(word)
                    except Exception:
col_names[idx] = 'non_ascii_word_' + str(idx)
col_names = ['nlp_' + key + '_' + str(word) for word in col_names]
self.text_columns[key].cleaned_feature_names = col_names
return self
def _attributes(event_path, number_events, alpha, betas, lambda_, cpu_time,
wall_time, function, method=None, attrs=None):
width = max([len(ss) for ss in (event_path,
str(number_events),
str(alpha),
str(betas),
str(lambda_),
function,
str(method),
socket.gethostname(),
getpass.getuser())])
width = max(19, width)
def _format(value):
return '{0: <{width}}'.format(value, width=width)
    if not isinstance(alpha, (float, int)):
alpha = 'varying'
new_attrs = {'date': _format(time.strftime("%Y-%m-%d %H:%M:%S")),
'event_path': _format(event_path),
'number_events': _format(number_events),
'alpha': _format(str(alpha)),
'betas': _format(str(betas)),
'lambda': _format(str(lambda_)),
'function': _format(function),
'method': _format(str(method)),
'cpu_time': _format(str(cpu_time)),
'wall_time': _format(str(wall_time)),
'hostname': _format(socket.gethostname()),
'username': _format(getpass.getuser()),
'pyndl': _format(__version__),
'numpy': _format(np.__version__),
'pandas': _format(pd.__version__),
'xarray': _format(xr.__version__),
'cython': _format(cython.__version__)}
if attrs is not None:
for key in set(attrs.keys()) | set(new_attrs.keys()):
if key in attrs:
old_val = attrs[key]
else:
old_val = ''
if key in new_attrs:
new_val = new_attrs[key]
else:
                new_val = _format('')
new_attrs[key] = old_val + ' | ' + new_val
return new_attrs