def test_one_iteration_no_metadata(self):
    columns = pd.MultiIndex.from_product([[1, 200], [1]],
                                         names=['depth', 'iter'])
    data = pd.DataFrame(data=[[1, 2], [1, 2], [1, 2]],
                        columns=columns, index=['S1', 'S2', 'S3'])
    # No counts provided because no metadata
    obs = _compute_summary(data, 'sample-id')
    d = [['S1', 1, 1, 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         ['S1', 200, 1, 2., 2., 2., 2., 2., 2., 2., 2., 2.],
         ['S2', 1, 1, 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         ['S2', 200, 1, 2., 2., 2., 2., 2., 2., 2., 2., 2.],
         ['S3', 1, 1, 1., 1., 1., 1., 1., 1., 1., 1., 1.],
         ['S3', 200, 1, 2., 2., 2., 2., 2., 2., 2., 2., 2.]]
    exp = pd.DataFrame(data=d, columns=['sample-id', 'depth', 'count',
                                        'min', '2%', '9%', '25%', '50%',
                                        '75%', '91%', '98%', 'max'])
    pdt.assert_frame_equal(exp, obs)
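The nine statistics in each expected row are quantiles of the per-depth iteration values under linear interpolation (the '2%' of [1, 2] is 1.02, as the next test shows). A minimal sketch of a summary computed that way, assuming pandas is imported as pd; the real _compute_summary is not shown here and may differ, and the hard-coded count of 1 stands in for metadata-derived group sizes:

# Sketch only: per sample and depth, take quantiles over iterations.
def _compute_summary_sketch(data, id_label):
    quantiles = [0., .02, .09, .25, .5, .75, .91, .98, 1.]
    rows = []
    for sample_id, row in data.iterrows():
        for depth, group in row.groupby(level='depth'):
            rows.append([sample_id, depth, 1] +
                        group.quantile(quantiles).tolist())
    return pd.DataFrame(rows, columns=[id_label, 'depth', 'count',
                                       'min', '2%', '9%', '25%', '50%',
                                       '75%', '91%', '98%', 'max'])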
def test_two_iterations_no_metadata(self):
    columns = pd.MultiIndex.from_product([[1, 200], [1, 2]],
                                         names=['depth', 'iter'])
    data = pd.DataFrame(data=[[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]],
                        columns=columns, index=['S1', 'S2', 'S3'])
    # No counts provided because no metadata
    obs = _compute_summary(data, 'sample-id')
    d = [['S1', 1, 1, 1., 1.02, 1.09, 1.25, 1.5, 1.75, 1.91, 1.98, 2.],
         ['S1', 200, 1, 3., 3.02, 3.09, 3.25, 3.5, 3.75, 3.91, 3.98, 4.],
         ['S2', 1, 1, 1., 1.02, 1.09, 1.25, 1.5, 1.75, 1.91, 1.98, 2.],
         ['S2', 200, 1, 3., 3.02, 3.09, 3.25, 3.5, 3.75, 3.91, 3.98, 4.],
         ['S3', 1, 1, 1., 1.02, 1.09, 1.25, 1.5, 1.75, 1.91, 1.98, 2.],
         ['S3', 200, 1, 3., 3.02, 3.09, 3.25, 3.5, 3.75, 3.91, 3.98, 4.]]
    exp = pd.DataFrame(data=d, columns=['sample-id', 'depth', 'count',
                                        'min', '2%', '9%', '25%', '50%',
                                        '75%', '91%', '98%', 'max'])
    pdt.assert_frame_equal(exp, obs)
def test_unique_metadata_groups(self):
    columns = pd.MultiIndex.from_tuples([(1, 1), (1, 2), (200, 1),
                                         (200, 2), ('pet', '')],
                                        names=['depth', 'iter'])
    data = pd.DataFrame(data=[[1, 2, 3, 4, 'russ'], [5, 6, 7, 8, 'milo'],
                              [9, 10, 11, 12, 'peanut']],
                        columns=columns, index=['S1', 'S2', 'S3'])
    obs = _reindex_with_metadata('pet', ['pet'], data)
    exp_col = pd.MultiIndex(levels=[[1, 200, 'pet'], [1, 2, '']],
                            labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
                            names=['depth', 'iter'])
    exp_ind = pd.Index(['milo', 'peanut', 'russ'], name='pet')
    exp = pd.DataFrame(data=[[5, 6, 7, 8], [9, 10, 11, 12], [1, 2, 3, 4]],
                       columns=exp_col, index=exp_ind)
    pdt.assert_frame_equal(exp, obs[0])
    exp = pd.DataFrame(data=[[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]],
                       columns=exp_col, index=exp_ind)
    pdt.assert_frame_equal(exp, obs[1])
def test_multiple_categories(self):
    columns = pd.MultiIndex.from_tuples([(1, 1), (1, 2), (200, 1),
                                         (200, 2), ('pet', ''),
                                         ('toy', '')],
                                        names=['depth', 'iter'])
    data = pd.DataFrame(data=[[1, 2, 3, 4, 'russ', 'stick'],
                              [5, 6, 7, 8, 'milo', 'yeti'],
                              [9, 10, 11, 12, 'peanut', 'stick']],
                        columns=columns, index=['S1', 'S2', 'S3'])
    obs = _reindex_with_metadata('pet', ['pet', 'toy'], data)
    exp_col = pd.MultiIndex(levels=[[1, 200, 'pet', 'toy'], [1, 2, '']],
                            labels=[[0, 0, 1, 1], [0, 1, 0, 1]],
                            names=['depth', 'iter'])
    exp_ind = pd.Index(['milo', 'peanut', 'russ'], name='pet')
    exp = pd.DataFrame(data=[[5, 6, 7, 8], [9, 10, 11, 12], [1, 2, 3, 4]],
                       columns=exp_col, index=exp_ind)
    pdt.assert_frame_equal(exp, obs[0])
    exp = pd.DataFrame(data=[[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]],
                       columns=exp_col, index=exp_ind)
    pdt.assert_frame_equal(exp, obs[1])

    obs = _reindex_with_metadata('toy', ['pet', 'toy'], data)
    exp_ind = pd.Index(['stick', 'yeti'], name='toy')
    exp = pd.DataFrame(data=[[5, 6, 7, 8], [5, 6, 7, 8]],
                       columns=exp_col, index=exp_ind)
    pdt.assert_frame_equal(exp, obs[0])
    exp = pd.DataFrame(data=[[2, 2, 2, 2], [1, 1, 1, 1]],
                       columns=exp_col, index=exp_ind)
    pdt.assert_frame_equal(exp, obs[1])
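Judging from these expectations, _reindex_with_metadata groups the depth/iter columns by one metadata column and returns the per-group median plus per-group sample counts (the two-sample 'stick' group yields [5, 6, 7, 8], the median of S1 and S3). A rough pandas equivalent, not the actual implementation:

# Sketch of the behavior the tests above expect; details may differ.
def _reindex_with_metadata_sketch(column, metadata_columns, data):
    key = data[(column, '')]
    numeric = data.drop([(c, '') for c in metadata_columns], axis=1)
    median = numeric.groupby(key).median()
    counts = numeric.groupby(key).count()
    median.index.name = counts.index.name = column
    return median, counts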
def normalize_segment_names(dataframe, inplace=False):
    """
    Makes the segment index of the dataframe have names which correspond
    to the original .mat segment names.
    :param dataframe: The dataframe with segment names.
    :param inplace: If True, the segment index will be changed in place
        in the given dataframe.
    :return: A DataFrame where the segment-name part of the index has
        been canonicalized. If inplace is True, the original dataframe
        is returned; otherwise a copy is returned.
    """
    index_values = dataframe.index.get_values()
    fixed_values = [(fileutils.get_segment_name(filename), frame)
                    for filename, frame in index_values]
    if not inplace:
        dataframe = dataframe.copy()
    dataframe.index = pd.MultiIndex.from_tuples(fixed_values,
                                                names=dataframe.index.names)
    return dataframe
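A self-contained usage sketch of the same index rewrite; _segment_name below is a hypothetical stand-in for fileutils.get_segment_name, which is not shown here:

# Hypothetical stand-in: strip the directory and the .mat extension.
def _segment_name(path):
    return path.split('/')[-1].rsplit('.mat', 1)[0]

idx = pd.MultiIndex.from_tuples(
    [('Dog_1/Dog_1_preictal_segment_0001.mat', 0),
     ('Dog_1/Dog_1_preictal_segment_0001.mat', 1)],
    names=['segment', 'frame'])
df = pd.DataFrame({'feature': [0.5, 0.7]}, index=idx)
df.index = pd.MultiIndex.from_tuples(
    [(_segment_name(f), frame) for f, frame in df.index],
    names=df.index.names)
# df.index now reads ('Dog_1_preictal_segment_0001', 0), ...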
def reshape_frames(dataframe, frame_length=12):
    """
    Returns a new dataframe with the given frame length.
    :param dataframe: A pandas DataFrame with one window per row.
    :param frame_length: The desired number of windows for each feature
        frame. Must divide the number of windows in *dataframe* evenly.
    :return: A new pandas DataFrame with the desired window frame width.
        The columns of the new dataframe are a MultiIndex so that future
        concatenation of dataframes aligns properly.
    """
    # Assert that the length of the dataframe is divisible by frame_length.
    n_windows, window_width = dataframe.shape
    if n_windows % frame_length != 0:
        raise ValueError("The dataframe has {} windows which"
                         " is not divisible by the frame"
                         " length {}".format(n_windows, frame_length))
    values = dataframe.values
    # Integer division, so reshape receives an int under Python 3.
    n_frames = n_windows // frame_length
    frame_width = window_width * frame_length
    window_columns = dataframe.columns
    column_index = pd.MultiIndex.from_product([range(frame_length),
                                               window_columns],
                                              names=['window', 'feature'])
    reshaped_frame = pd.DataFrame(data=values.reshape(n_frames,
                                                      frame_width),
                                  columns=column_index)
    # Sorting returns a new object rather than acting in place, so the
    # result must be assigned back (sort_index also replaces the
    # deprecated sortlevel used in the original).
    reshaped_frame = reshaped_frame.sort_index(axis=1)
    return reshaped_frame
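A quick usage sketch, assuming pandas and numpy are imported as pd and np:

# Six windows of four features regrouped into two frames of three
# windows each: a 2 x 12 result with ('window', 'feature') columns.
windows = pd.DataFrame(np.arange(24).reshape(6, 4),
                       columns=['f0', 'f1', 'f2', 'f3'])
framed = reshape_frames(windows, frame_length=3)
assert framed.shape == (2, 12)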
def get_zeroth_quarter_idx(self, stacked_last_per_qtr):
    """
    Filters for releases that are on or after each simulation date and
    determines the next quarter by picking out the upcoming release for
    each date in the index.

    Parameters
    ----------
    stacked_last_per_qtr : pd.DataFrame
        A DataFrame with index of calendar dates, sid, and normalized
        quarters with each row being the latest estimate for the row's
        index values, sorted by event date.

    Returns
    -------
    next_releases_per_date_index : pd.MultiIndex
        An index of calendar dates, sid, and normalized quarters, for only
        the rows that have a next event.
    """
    next_releases_per_date = stacked_last_per_qtr.loc[
        stacked_last_per_qtr[EVENT_DATE_FIELD_NAME] >=
        stacked_last_per_qtr.index.get_level_values(SIMULATION_DATES)
    ].groupby(
        level=[SIMULATION_DATES, SID_FIELD_NAME],
        as_index=False,
        # Here we take advantage of the fact that `stacked_last_per_qtr` is
        # sorted by event date.
    ).nth(0)
    return next_releases_per_date.index
def get_zeroth_quarter_idx(self, stacked_last_per_qtr):
    """
    Filters for releases that are on or before each simulation date and
    determines the previous quarter by picking out the most recent
    release relative to each date in the index.

    Parameters
    ----------
    stacked_last_per_qtr : pd.DataFrame
        A DataFrame with index of calendar dates, sid, and normalized
        quarters with each row being the latest estimate for the row's
        index values, sorted by event date.

    Returns
    -------
    previous_releases_per_date_index : pd.MultiIndex
        An index of calendar dates, sid, and normalized quarters, for only
        the rows that have a previous event.
    """
    previous_releases_per_date = stacked_last_per_qtr.loc[
        stacked_last_per_qtr[EVENT_DATE_FIELD_NAME] <=
        stacked_last_per_qtr.index.get_level_values(SIMULATION_DATES)
    ].groupby(
        level=[SIMULATION_DATES, SID_FIELD_NAME],
        as_index=False,
        # Here we take advantage of the fact that `stacked_last_per_qtr` is
        # sorted by event date.
    ).nth(-1)
    return previous_releases_per_date.index
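Both methods use the same filter-then-nth pattern. A self-contained toy version, where the column and level names are illustrative stand-ins for zipline's field constants, assuming pandas as pd:

# One sid, two simulation dates, each with one past and one future event.
dates = pd.to_datetime(['2020-01-10'] * 2 + ['2020-01-15'] * 2)
df = pd.DataFrame(
    {'event_date': pd.to_datetime(['2020-01-05', '2020-01-20'] * 2)},
    index=pd.MultiIndex.from_arrays(
        [dates, [1, 1, 1, 1], [1.0, 2.0, 1.0, 2.0]],
        names=['dates', 'sid', 'normalized_quarters']))
# "Next": the first event on or after each simulation date.
next_idx = df.loc[
    df['event_date'] >= df.index.get_level_values('dates')
].groupby(level=['dates', 'sid'], as_index=False).nth(0).index
# "Previous": the last event on or before each simulation date.
prev_idx = df.loc[
    df['event_date'] <= df.index.get_level_values('dates')
].groupby(level=['dates', 'sid'], as_index=False).nth(-1).index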
def validate(self, obj, value):
    value = super(PandasDataFrame, self).validate(obj, value)
    if self.get_metadata('lexsort'):
        if isinstance(value.columns, pd.MultiIndex):
            value = value.sortlevel(0, axis=1)
    return value
def testTwoDimensionalDistribution(self):
    df = pd.DataFrame({"X": [1, 1, 1, 2, 2, 3, 4],
                       "Y": [1, 2, 0, 1, 1, 1, 1],
                       "Z": [1, 0, 0, 0, 0, 0, 0]})
    weights = np.array([1, 1, 1, 1, 1, 1, 1])
    metric = metrics.Distribution("X", ["Y", "Z"])
    output = metric(df, weights)
    correct = pd.DataFrame(
        np.array([1 / 14., 1 / 14., 1 / 14., 11 / 14.]),
        columns=[""],
        index=pd.MultiIndex(levels=[[0, 1, 2], [0, 1]],
                            labels=[[1, 2, 0, 1], [1, 0, 0, 0]],
                            names=["Y", "Z"]))
    self.assertTrue(output.equals(correct))
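The expected values are each (Y, Z) cell's weighted share of the X total, which is 14 here. A minimal recomputation, reusing the df and weights defined in the test above:

# Weighted X summed per (Y, Z) cell, divided by the weighted grand total.
weighted_x = df["X"] * weights
shares = weighted_x.groupby([df["Y"], df["Z"]]).sum() / weighted_x.sum()
# shares[(1, 0)] == 11/14.; shares[(0, 0)] == shares[(1, 1)] == 1/14.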
def testShuffledDataframeRelativeToJackknife(self):
    # Same as test above, but also testing that reordering the data doesn't
    # change results, up to order.
    df = pd.DataFrame({"X": range(11),
                       "Y": np.concatenate((np.zeros(6), np.ones(5))),
                       "Z": np.concatenate((np.zeros(3), np.ones(8)))})
    metric = metrics.Distribution("X", ["Z"])
    se_method = standard_errors.Jackknife()
    output = core.Analyze(df.iloc[np.random.permutation(11)]).relative_to(
        comparisons.AbsoluteDifference("Y", 0)).with_standard_errors(
        se_method).calculate(metric).run()
    output = (output.
              reset_index().
              sort_values(by=["Y", "Z"]).
              set_index(["Y", "Z"]))
    correct = pd.DataFrame(
        np.array([[-0.2, 0.18100283490],
                  [0.2, 0.18100283490]]),
        columns=["X Distribution Absolute Difference",
                 "X Distribution Absolute Difference Jackknife SE"],
        index=pd.MultiIndex(levels=[[1.], [0., 1.]],
                            labels=[[0, 0], [0, 1]],
                            names=["Y", "Z"]))
    correct = (correct.
               reset_index().
               sort_values(by=["Y", "Z"]).
               set_index(["Y", "Z"]))
    self.assertTrue(all(output.index == correct.index) and
                    all(output.columns == correct.columns) and
                    np.all(abs(output.values - correct.values) < 1e-10))
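For reference, the leave-one-out jackknife that standard_errors.Jackknife() is named after, sketched generically for a plain statistic; the library's actual implementation, which also handles comparisons and slicing, is more involved:

# Delete-one jackknife SE: SE = sqrt((n-1)/n * sum((theta_i - theta_bar)^2)).
# Assumes numpy as np and len(x) >= 2.
def jackknife_se(x, stat=np.mean):
    n = len(x)
    loo = np.array([stat(np.delete(x, i)) for i in range(n)])
    return np.sqrt((n - 1.) / n * ((loo - loo.mean()) ** 2).sum())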
def groupby_deco(func):
    def func_wrapper(self, thing, *args, **kwargs):
        if isinstance(thing, pd.core.groupby.DataFrameGroupBy):
            agg = thing.apply(lambda x: func(self, x, *args, **kwargs))
            is_series = isinstance(agg, pd.core.series.Series)
            has_multiindex = isinstance(agg.index, pd.MultiIndex)
            if is_series and has_multiindex:
                return agg.unstack()
            else:
                return agg
        return func(self, thing, *args, **kwargs)
    return func_wrapper
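A hypothetical usage sketch: the decorated method accepts either a plain DataFrame or a DataFrameGroupBy, and grouped scalar results come back as a Series (unstacked into a frame when the group keys form a MultiIndex):

# Hypothetical example class; assumes pandas as pd.
class Stats(object):
    @groupby_deco
    def mean_of(self, frame, column):
        return frame[column].mean()

df = pd.DataFrame({'g': ['a', 'a', 'b'], 'x': [1., 3., 5.]})
Stats().mean_of(df, 'x')               # 3.0
Stats().mean_of(df.groupby('g'), 'x')  # Series: a -> 2.0, b -> 5.0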
From common.py in PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda (author: SignalMedia):
def _isnull_new(obj):
    if lib.isscalar(obj):
        return lib.checknull(obj)
    # hack (for now) because MI registers as ndarray
    elif isinstance(obj, pd.MultiIndex):
        raise NotImplementedError("isnull is not defined for MultiIndex")
    elif isinstance(obj, (ABCSeries, np.ndarray, pd.Index)):
        return _isnull_ndarraylike(obj)
    elif isinstance(obj, ABCGeneric):
        return obj._constructor(obj._data.isnull(func=isnull))
    elif isinstance(obj, list) or hasattr(obj, '__array__'):
        return _isnull_ndarraylike(np.asarray(obj))
    else:
        return obj is None
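At the public API, this dispatch behaves as follows in the pandas version the snippet comes from (where isnull on a MultiIndex raised NotImplementedError; recent pandas handles it):

import numpy as np
import pandas as pd

pd.isnull(np.nan)                   # True (scalar branch)
pd.isnull(pd.Series([1., np.nan]))  # element-wise boolean Series
pd.isnull([1, None])                # np.array([False, True]), via np.asarray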
From test_timeseries.py in PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda (author: SignalMedia):
def test_get_level_values_box(self):
    from pandas import MultiIndex

    dates = date_range('1/1/2000', periods=4)
    levels = [dates, [0, 1]]
    labels = [[0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 0, 1, 0, 1, 0, 1]]
    index = MultiIndex(levels=levels, labels=labels)
    self.assertTrue(isinstance(index.get_level_values(0)[0], Timestamp))
From test_base.py in PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda (author: SignalMedia):
def setUp(self):
    self.indices = dict(unicodeIndex=tm.makeUnicodeIndex(100),
                        strIndex=tm.makeStringIndex(100),
                        dateIndex=tm.makeDateIndex(100),
                        periodIndex=tm.makePeriodIndex(100),
                        tdIndex=tm.makeTimedeltaIndex(100),
                        intIndex=tm.makeIntIndex(100),
                        rangeIndex=tm.makeIntIndex(100),
                        floatIndex=tm.makeFloatIndex(100),
                        boolIndex=Index([True, False]),
                        catIndex=tm.makeCategoricalIndex(100),
                        empty=Index([]),
                        tuples=MultiIndex.from_tuples(lzip(
                            ['foo', 'bar', 'baz'], [1, 2, 3])))
    self.setup_indices()
From test_base.py in PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda (author: SignalMedia):
def test_construction_list_mixed_tuples(self):
    # GH 10697
    # if we are constructing from a mixed list of tuples, make sure that
    # we are independent of the sorting order
    # (the original chained the assertions with `and`, which discarded the
    # second one, and used the nonexistent assertNotInstance; fixed here)
    idx1 = Index([('A', 1), 'B'])
    self.assertIsInstance(idx1, Index)
    self.assertNotIsInstance(idx1, MultiIndex)
    idx2 = Index(['B', ('A', 1)])
    self.assertIsInstance(idx2, Index)
    self.assertNotIsInstance(idx2, MultiIndex)
From test_base.py in PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda (author: SignalMedia):
def test_str_attribute(self):
    # GH9068
    methods = ['strip', 'rstrip', 'lstrip']
    idx = Index([' jack', 'jill ', ' jesse ', 'frank'])
    for method in methods:
        expected = Index([getattr(str, method)(x) for x in idx.values])
        tm.assert_index_equal(getattr(idx.str, method)(), expected)

    # create a few instances that are not able to use .str accessor
    indices = [Index(range(5)), tm.makeDateIndex(10),
               MultiIndex.from_tuples([('foo', '1'), ('bar', '3')]),
               PeriodIndex(start='2000', end='2010', freq='A')]
    for idx in indices:
        with self.assertRaisesRegexp(AttributeError,
                                     'only use .str accessor'):
            idx.str.repeat(2)

    idx = Index(['a b c', 'd e', 'f'])
    expected = Index([['a', 'b', 'c'], ['d', 'e'], ['f']])
    tm.assert_index_equal(idx.str.split(), expected)
    tm.assert_index_equal(idx.str.split(expand=False), expected)

    expected = MultiIndex.from_tuples([('a', 'b', 'c'), ('d', 'e', np.nan),
                                       ('f', np.nan, np.nan)])
    tm.assert_index_equal(idx.str.split(expand=True), expected)

    # test boolean case, should return np.array instead of boolean Index
    idx = Index(['a1', 'a2', 'b1', 'b2'])
    expected = np.array([True, True, False, False])
    tm.assert_numpy_array_equal(idx.str.startswith('a'), expected)
    self.assertIsInstance(idx.str.startswith('a'), np.ndarray)
    s = Series(range(4), index=idx)
    expected = Series(range(2), index=['a1', 'a2'])
    tm.assert_series_equal(s[s.index.str.startswith('a')], expected)
From test_base.py in PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda (author: SignalMedia):
def test_reindex_doesnt_preserve_type_if_target_is_empty_index(self):
    # GH7774
    idx = pd.Index(list('abc'))

    def get_reindex_type(target):
        return idx.reindex(target)[0].dtype.type

    self.assertEqual(get_reindex_type(pd.Int64Index([])), np.int64)
    self.assertEqual(get_reindex_type(pd.Float64Index([])), np.float64)
    self.assertEqual(get_reindex_type(pd.DatetimeIndex([])), np.datetime64)

    reindexed = idx.reindex(pd.MultiIndex(
        [pd.Int64Index([]), pd.Float64Index([])], [[], []]))[0]
    self.assertEqual(reindexed.levels[0].dtype.type, np.int64)
    self.assertEqual(reindexed.levels[1].dtype.type, np.float64)
From test_reshape.py in PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda (author: SignalMedia):
def test_pivot_index_none(self):
    # gh-3962
    data = {
        'index': ['A', 'B', 'C', 'C', 'B', 'A'],
        'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'],
        'values': [1., 2., 3., 3., 2., 1.]
    }
    frame = DataFrame(data).set_index('index')
    result = frame.pivot(columns='columns', values='values')
    expected = DataFrame({
        'One': {'A': 1., 'B': 2., 'C': 3.},
        'Two': {'A': 1., 'B': 2., 'C': 3.}
    })
    expected.index.name, expected.columns.name = 'index', 'columns'
    assert_frame_equal(result, expected)

    # omit values
    result = frame.pivot(columns='columns')
    expected.columns = pd.MultiIndex.from_tuples([('values', 'One'),
                                                  ('values', 'Two')],
                                                 names=[None, 'columns'])
    expected.index.name = 'index'
    assert_frame_equal(result, expected, check_names=False)
    self.assertEqual(result.index.name, 'index')
    self.assertEqual(result.columns.names, (None, 'columns'))

    expected.columns = expected.columns.droplevel(0)

    # note: this dict is never used below; `frame` from above is pivoted
    # again as-is
    data = {
        'index': range(7),
        'columns': ['One', 'One', 'One', 'Two', 'Two', 'Two'],
        'values': [1., 2., 3., 3., 2., 1.]
    }
    result = frame.pivot(columns='columns', values='values')
    expected.columns.name = 'columns'
    assert_frame_equal(result, expected)