def test_filter_to_numeric(self):
    index = pd.Index(['a', 'b', 'c'], dtype=object)
    df = pd.DataFrame({'col1': ['2', '1', '3'],
                       'col2': ['two', 'one', 'three']},
                      index=index, dtype=object)
    metadata = qiime2.Metadata(df)

    obs_df = metadata.filter(column_type='numeric').to_dataframe()

    # np.int/np.float were removed from numpy; use the explicit
    # fixed-width aliases instead.
    exp_df = pd.DataFrame({'col1': [2, 1, 3]}, dtype=np.int64, index=index)
    pdt.assert_frame_equal(obs_df, exp_df)

    df = pd.DataFrame({'col1': ['2', '1', '3'],
                       'col2': ['2', '1', 'three'],
                       'col3': ['4.0', '5.2', '6.9']},
                      index=index, dtype=object)
    metadata = qiime2.Metadata(df)

    obs_df = metadata.filter(column_type='numeric').to_dataframe()

    exp_df = pd.DataFrame({'col1': [2, 1, 3],
                           'col3': [4.0, 5.2, 6.9]}, index=index)
    pdt.assert_frame_equal(obs_df, exp_df)
    self.assertEqual(dict(obs_df.dtypes),
                     {'col1': np.int64, 'col3': np.float64})
def _add_field(self, field):
    """
    Adds a new field to the container.
    """
    # self.fields is already sorted, so we just need to insert the new
    # field at the correct index.
    ls = list(self.fields)
    insort_left(ls, field)
    self.fields = pd.Index(ls)

    # unset fillable fields cache
    self._ffillable_fields = None

    self._realign_fields()
    self.last_known_prior_values = self.last_known_prior_values.reindex(
        index=self.prior_values_index,
    )
    return field
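A minimal, standalone sketch of the same insertion pattern: pd.Index is immutable, so the round trip through a sorted list (via insort_left) is what keeps the index ordered.

from bisect import insort_left

import pandas as pd

fields = pd.Index(['close', 'open', 'volume'])

ls = list(fields)
insort_left(ls, 'high')  # binary-search insert keeps the list sorted
fields = pd.Index(ls)
print(fields)  # Index(['close', 'high', 'open', 'volume'], dtype='object')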
def test_some_duplicates_in_category(self):
    columns = pd.MultiIndex.from_tuples([(1, 1), (1, 2), (200, 1),
                                         (200, 2), ('pet', '')],
                                        names=['depth', 'iter'])
    data = pd.DataFrame(data=[[1, 2, 3, 4, 'russ'], [5, 6, 7, 8, 'milo'],
                              [9, 10, 11, 12, 'russ']],
                        columns=columns, index=['S1', 'S2', 'S3'])

    obs = _reindex_with_metadata('pet', ['pet'], data)

    # `labels` was renamed to `codes` in the pd.MultiIndex constructor.
    exp_col = pd.MultiIndex(levels=[[1, 200, 'pet'], [1, 2, '']],
                            codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
                            names=['depth', 'iter'])
    exp_ind = pd.Index(['milo', 'russ'], name='pet')
    exp = pd.DataFrame(data=[[5, 6, 7, 8], [5, 6, 7, 8]],
                       columns=exp_col, index=exp_ind)
    pdt.assert_frame_equal(exp, obs[0])

    exp = pd.DataFrame(data=[[1, 1, 1, 1], [2, 2, 2, 2]],
                       columns=exp_col, index=exp_ind)
    pdt.assert_frame_equal(exp, obs[1])
def test_all_identical(self):
    columns = pd.MultiIndex.from_tuples([(1, 1), (1, 2), (200, 1),
                                         (200, 2), ('pet', '')],
                                        names=['depth', 'iter'])
    data = pd.DataFrame(data=[[1, 2, 3, 4, 'russ'], [5, 6, 7, 8, 'russ'],
                              [9, 10, 11, 12, 'russ']],
                        columns=columns, index=['S1', 'S2', 'S3'])

    obs = _reindex_with_metadata('pet', ['pet'], data)

    # `labels` was renamed to `codes` in the pd.MultiIndex constructor.
    exp_col = pd.MultiIndex(levels=[[1, 200, 'pet'], [1, 2, '']],
                            codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
                            names=['depth', 'iter'])
    exp_ind = pd.Index(['russ'], name='pet')
    exp = pd.DataFrame(data=[[5, 6, 7, 8]],
                       columns=exp_col, index=exp_ind)
    pdt.assert_frame_equal(exp, obs[0])

    exp = pd.DataFrame(data=[[3, 3, 3, 3]],
                       columns=exp_col, index=exp_ind)
    pdt.assert_frame_equal(exp, obs[1])
def cross_join(df1, df2):
    """
    Return the cross join (Cartesian product of rows) of dataframes
    df1 and df2.
    ref: https://github.com/pydata/pandas/issues/5401
    """
    if len(df1) == 0:
        return df2
    if len(df2) == 0:
        return df1

    # Add as lists so that the new index keeps the items in
    # the order that they are added together
    all_columns = pd.Index(list(df1.columns) + list(df2.columns))

    # Use assign() so the temporary merge key does not mutate the
    # caller's dataframes in place.
    df1 = df1.assign(key=1)
    df2 = df2.assign(key=1)
    return pd.merge(df1, df2, on='key').loc[:, all_columns]
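Assuming the helper above is in scope, a quick usage check: every row of the left frame is paired with every row of the right, and the temporary 'key' column is dropped by the final column selection.

import pandas as pd

left = pd.DataFrame({'a': [1, 2]})
right = pd.DataFrame({'b': ['x', 'y']})

print(cross_join(left, right))
#    a  b
# 0  1  x
# 1  1  y
# 2  2  x
# 3  2  y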
def _split_sample(
        split: Callable[[object], bool], X: np.ndarray, y: np.ndarray
) -> Tuple[Tuple[np.ndarray, np.ndarray], Tuple[np.ndarray, np.ndarray]]:
    """
    Split the X, y sample set in two with a split rule.

    :return: ((X_left, y_left), (X_right, y_right))
    """
    # Compare the split type by value; identity comparison with `is`
    # against a string literal is unreliable.
    if split.type == 'numerical':
        left_indexes = X[:, split.attribute] < split.criteria
        right_indexes = ~left_indexes
    else:
        Z = (
            pd.Index(pd.unique(split.criteria))
            .get_indexer(X[:, split.attribute]))
        left_indexes = np.where(Z >= 0)[0]
        right_indexes = np.where(Z < 0)[0]

    left = X[left_indexes], y[left_indexes]
    right = X[right_indexes], y[right_indexes]
    return left, right
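The categorical branch above relies on pd.Index.get_indexer, which returns a non-negative position for values present in the index and -1 otherwise. A standalone sketch of just that lookup (the `split` object itself is not needed):

import numpy as np
import pandas as pd

X = np.array([['red'], ['blue'], ['green'], ['red']], dtype=object)
criteria = np.array(['red', 'green'])  # category values routed left

Z = pd.Index(pd.unique(criteria)).get_indexer(X[:, 0])
print(Z)                    # [ 0 -1  1  0]
print(np.where(Z >= 0)[0])  # [0 2 3] -> left rows
print(np.where(Z < 0)[0])   # [1]     -> right rows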
def get_dividend(self, order_book_id, adjusted=True):
    """
    Fetch the dividend records for an instrument.

    :param str order_book_id: instrument code
    :param bool adjusted: whether to return the price-adjusted records
    :return: pd.DataFrame indexed by announcement date
    """
    def fetchData(adjusted):
        if adjusted:
            mongo_data = self._adjusted_dividends[order_book_id].find({}, {"_id": 0})
        else:
            mongo_data = self._original_dividends[order_book_id].find({}, {"_id": 0})
        return mongo_data

    # Materialize the cursor once instead of re-querying Mongo for
    # every column.
    records = list(fetchData(adjusted))
    result = pd.DataFrame({
        'book_closure_date': pd.Index(pd.Timestamp(d['book_closure_date']) for d in records),
        'ex_dividend_date': pd.Index(pd.Timestamp(d['ex_dividend_date']) for d in records),
        'payable_date': pd.Index(pd.Timestamp(d['payable_date']) for d in records),
        'dividend_cash_before_tax': [d['dividend_cash_before_tax'] for d in records],
        'round_lot': [d['round_lot'] for d in records]
    }, index=pd.Index(pd.Timestamp(d['announcement_date']) for d in records))
    return result
def get_yield_curve(self, start_date, end_date, tenor):
    d1 = start_date.year * 10000 + start_date.month * 100 + start_date.day
    d2 = end_date.year * 10000 + end_date.month * 100 + end_date.day

    s = self._dates.searchsorted(d1)
    e = self._dates.searchsorted(d2, side='right')

    if e == len(self._dates):
        e -= 1
    if self._dates[e] == d2:
        # include end_date in the slice
        e += 1

    if e < s:
        return None

    df = pd.DataFrame(self._table[s:e])
    df.index = pd.Index(pd.Timestamp(str(d)) for d in df['date'])
    del df['date']
    df.rename(columns=lambda n: n[1:] + n[0], inplace=True)

    if tenor is not None:
        return df[tenor]
    return df
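The date windowing above hinges on searchsorted over a sorted array of yyyymmdd integer keys. A minimal sketch of the core idea (the extra boundary adjustments in the method guard the slice ends):

import numpy as np

dates = np.array([20200102, 20200103, 20200106, 20200107])  # sorted yyyymmdd keys

d1, d2 = 20200103, 20200106
s = dates.searchsorted(d1)                # first position >= start date
e = dates.searchsorted(d2, side='right')  # one past the last position <= end date
print(dates[s:e])  # [20200103 20200106]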
def fit_behavioral_data():
    """Fit a model for all subjects."""
    df = pd.read_pickle('data.pkl')
    subjects = df.index.get_level_values('subject').unique()
    data = np.empty((subjects.size, 10))
    cues = (0, 1)
    for i, subject in enumerate(subjects):
        print('Fitting model for subject {}'.format(subject))
        df_s = df.loc[subject]
        for cue in cues:
            ml = ML(df_s[df_s['cue'] == cue])
            r = ml.ml_estimation()
            data[i, 2 * cue:2 * cue + 2] = r.x
            data[i, 2 * cue + 4:2 * cue + 6] = np.sqrt(
                np.diag(r.hess_inv.todense()))
            data[i, cue + 8] = r.fun
    model = pd.DataFrame(data, pd.Index(subjects, name='subject'),
                         ['alpha_0', 'beta_0', 'alpha_1', 'beta_1',
                          'se_alpha_0', 'se_beta_0', 'se_alpha_1', 'se_beta_1',
                          'NLL_0', 'NLL_1'])
    return model
def update_table_models(self, visible=None, hidden=None):
    if visible is None and hidden is None:
        manager = self.Session.get_manager()
        # Drop hidden columns that no longer exist in the output object.
        for x in list(manager.hidden_columns):
            if x not in self.Session.output_object.columns:
                manager.hidden_columns.remove(x)
        hidden_cols = pd.Index(manager.hidden_columns)
        vis_cols = [x for x in self.Session.output_object.columns
                    if x not in hidden_cols]
        to_show = self.Session.output_object[vis_cols]
        to_hide = self.Session.output_object[hidden_cols]
    else:
        to_show = visible
        to_hide = hidden

    self.table_model = classes.CoqTableModel(
        to_show, session=self.Session)
    self.hidden_model = classes.CoqHiddenTableModel(
        to_hide, session=self.Session)
    self.set_columns_widget()
    self.table_model.dataChanged.connect(self.change_userdata)
def json_conversion(obj):
    """Encode additional objects to JSON."""
    try:
        # numpy isn't an explicit dependency of bowtie
        # so we can't assume it's available
        import numpy as np
        if isinstance(obj, (np.ndarray, np.generic)):
            return obj.tolist()
    except ImportError:
        pass

    try:
        # pandas isn't an explicit dependency of bowtie
        # so we can't assume it's available
        import pandas as pd
        if isinstance(obj, pd.Index):
            return obj.tolist()
    except ImportError:
        pass

    if isinstance(obj, (datetime, time, date)):
        return obj.isoformat()

    raise TypeError('Not sure how to serialize {} of type {}'.format(obj, type(obj)))
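Passed as the `default` hook to json.dumps, the converter above lets the standard json module serialize numpy arrays, pandas indexes, and date/time objects:

import json

import numpy as np
import pandas as pd

payload = {'values': np.arange(3), 'labels': pd.Index(['a', 'b'])}
print(json.dumps(payload, default=json_conversion))
# {"values": [0, 1, 2], "labels": ["a", "b"]}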
def encoders(obj):
    """Convert Python objects to msgpack-encodable ones."""
    try:
        # numpy isn't an explicit dependency of bowtie
        # so we can't assume it's available
        import numpy as np
        if isinstance(obj, (np.ndarray, np.generic)):
            # https://docs.scipy.org/doc/numpy/reference/arrays.scalars.html
            return obj.tolist()
    except ImportError:
        pass

    try:
        # pandas isn't an explicit dependency of bowtie
        # so we can't assume it's available
        import pandas as pd
        if isinstance(obj, pd.Index):
            return obj.tolist()
    except ImportError:
        pass

    if isinstance(obj, (datetime, time, date)):
        return obj.isoformat()

    return obj
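Unlike json_conversion, this hook falls back to returning the object unchanged, which suits msgpack's `default` callback. A usage sketch, assuming the msgpack package (>= 1.0) is installed:

import msgpack
import pandas as pd

packed = msgpack.packb({'index': pd.Index([1, 2, 3])}, default=encoders)
print(msgpack.unpackb(packed))  # {'index': [1, 2, 3]}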
def batch_market_order(self, share_counts):
    """Place a batch market order for multiple assets.

    Parameters
    ----------
    share_counts : pd.Series[Asset -> int]
        Map from asset to number of shares to order for that asset.

    Returns
    -------
    order_ids : pd.Index[str]
        Index of ids for newly-created orders.
    """
    style = MarketOrder()
    order_args = [
        (asset, amount, style)
        for (asset, amount) in iteritems(share_counts)
        if amount
    ]
    return self.blotter.batch_order(order_args)
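A hypothetical call sketch; the asset handles (aapl, msft) and the algo object holding this method are stand-ins, not part of the original source:

import pandas as pd

# aapl and msft stand in for Asset objects obtained elsewhere;
# the zero-share entry is skipped by the comprehension above.
share_counts = pd.Series({aapl: 100, msft: 0})
order_ids = algo.batch_market_order(share_counts)  # pd.Index of order-id strings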
def test_filter_to_categorical(self):
    index = pd.Index(['a', 'b', 'c'], dtype=object)
    df = pd.DataFrame({'col1': ['2', '1', '3'],
                       'col2': ['a', 'b', 'c']},
                      index=index, dtype=object)
    metadata = qiime2.Metadata(df)

    obs_df = metadata.filter(column_type='categorical').to_dataframe()

    exp_df = pd.DataFrame({'col2': ['a', 'b', 'c']}, index=index)
    pdt.assert_frame_equal(obs_df, exp_df)

    df = pd.DataFrame({'col1': ['2', '1', '3'],
                       'col2': ['a', 'b', 'c'],
                       'col3': ['peanut', 'hotdog', 'gwar']},
                      index=index, dtype=object)
    metadata = qiime2.Metadata(df)

    obs_df = metadata.filter(column_type='categorical').to_dataframe()

    exp_df = pd.DataFrame({'col2': ['a', 'b', 'c'],
                           'col3': ['peanut', 'hotdog', 'gwar']},
                          index=index)
    pdt.assert_frame_equal(obs_df, exp_df)
def test_no_columns(self):
    fp = pkg_resources.resource_filename(
        'qiime2.tests', 'data/metadata/no-columns.tsv')

    metadata = qiime2.Metadata.load(fp)
    obs_df = metadata.to_dataframe()

    exp_index = pd.Index(['a', 'b', 'id'], name='my-index', dtype=object)
    exp_df = pd.DataFrame({}, index=exp_index, dtype=object)
    self.assertFalse(obs_df.index.empty)
    self.assertTrue(obs_df.columns.empty)
    pdt.assert_frame_equal(
        obs_df, exp_df, check_dtype=True, check_index_type=True,
        check_column_type=True, check_frame_type=True, check_names=True,
        check_exact=True)
def test_index_and_column_names(self):
    md1 = qiime2.Metadata(pd.DataFrame(
        {'a': [1, 2]},
        index=pd.Index(['id1', 'id2'], name='foo'),
        columns=pd.Index(['a'], name='abc')))
    md2 = qiime2.Metadata(pd.DataFrame(
        {'b': [3, 4]},
        index=pd.Index(['id1', 'id2'], name='bar'),
        columns=pd.Index(['b'], name='def')))

    obs = md1.merge(md2)

    exp = qiime2.Metadata(pd.DataFrame(
        {'a': [1, 2], 'b': [3, 4]}, index=['id1', 'id2']))
    self.assertEqual(obs, exp)
    self.assertIsNone(obs._dataframe.index.name)
    self.assertIsNone(obs._dataframe.columns.name)
def test_more_complex_expressions(self):
    df = pd.DataFrame({'Subject': ['subject-1', 'subject-1', 'subject-2'],
                       'SampleType': ['gut', 'tongue', 'gut']},
                      index=pd.Index(['S1', 'S2', 'S3'], name='id'))
    metadata = qiime2.Metadata(df)

    where = "Subject='subject-1' OR Subject='subject-2'"
    actual = metadata.ids(where)
    expected = {'S1', 'S2', 'S3'}
    self.assertEqual(actual, expected)

    where = "Subject='subject-1' AND Subject='subject-2'"
    actual = metadata.ids(where)
    expected = set()
    self.assertEqual(actual, expected)

    where = "Subject='subject-1' AND SampleType='gut'"
    actual = metadata.ids(where)
    expected = {'S1'}
    self.assertEqual(actual, expected)
def testMultipleCalculationsRelativeTo(self):
    data = pd.DataFrame({"X": (1, 2, 3, 10, 20, 30, 100, 200, 300),
                         "Y": (0, 1, 2, 3, 4, 5, 6, 7, 8),
                         "Experiment": ("Control", "Control", "Control",
                                        "Exp1", "Exp1", "Exp1",
                                        "Exp2", "Exp2", "Exp2")})
    comparison = comparisons.AbsoluteDifference("Experiment", "Control")
    output = core.Analyze(data).relative_to(comparison).calculate(
        (metrics.Sum("X"), metrics.Sum("Y"))).run()
    correct = pd.DataFrame(
        {"sum(X) Absolute Difference": (60 - 6, 600 - 6),
         "sum(Y) Absolute Difference": (12 - 3, 21 - 3)},
        index=pd.Index(("Exp1", "Exp2"), name="Experiment"))
    self.assertTrue(output.equals(correct))
def testRelativeToJackknife(self):
    data = pd.DataFrame({"X": [1, 2, 3, 4, 5, 6, 7, 8, 9],
                         "Y": [0, 0, 0, 1, 1, 1, 2, 2, 2]})
    metric = metrics.Sum("X")
    comparison = comparisons.AbsoluteDifference("Y", 0)
    se_method = standard_errors.Jackknife()
    output = core.Analyze(data).relative_to(comparison).with_standard_errors(
        se_method).calculate(metric).run()
    rowindex = pd.Index([1, 2], name="Y")
    correct = pd.DataFrame(
        np.array([[9.0, np.sqrt(5 * np.var([12, 11, 10, 5, 4, 3]))],
                  [18.0, np.sqrt(5 * np.var([21, 20, 19, 11, 10, 9]))]]),
        columns=("sum(X) Absolute Difference",
                 "sum(X) Absolute Difference Jackknife SE"),
        index=rowindex)
    self.assertTrue(output.equals(correct))
def testRelativeToJackknifeIncludeBaseline(self):
    data = pd.DataFrame({"X": [1, 2, 3, 4, 5, 6, 7, 8, 9],
                         "Y": [0, 0, 0, 1, 1, 1, 2, 2, 2]})
    metric = metrics.Sum("X")
    comparison = comparisons.AbsoluteDifference("Y", 0, include_base=True)
    se_method = standard_errors.Jackknife()
    output = core.Analyze(data).relative_to(comparison).with_standard_errors(
        se_method).calculate(metric).run()
    rowindex = pd.Index([0, 1, 2], name="Y")
    correct = pd.DataFrame(
        np.array([[0.0, 0.0],
                  [9.0, np.sqrt(5 * np.var([12, 11, 10, 5, 4, 3]))],
                  [18.0, np.sqrt(5 * np.var([21, 20, 19, 11, 10, 9]))]]),
        columns=("sum(X) Absolute Difference",
                 "sum(X) Absolute Difference Jackknife SE"),
        index=rowindex)
    self.assertTrue(output.equals(correct))
def testRelativeToJackknifeSingleComparisonBaselineFirst(self):
    data = pd.DataFrame({"X": [1, 2, 3, 4, 5, 6], "Y": [0, 0, 0, 1, 1, 1]})
    metric = metrics.Sum("X")
    comparison = comparisons.AbsoluteDifference("Y", 0)
    se_method = standard_errors.Jackknife()
    output = core.Analyze(data).relative_to(comparison).with_standard_errors(
        se_method).calculate(metric).run()
    rowindex = pd.Index([1], name="Y")
    correct = pd.DataFrame(
        np.array([[9.0, np.sqrt(5 * np.var([12, 11, 10, 5, 4, 3]))]]),
        columns=("sum(X) Absolute Difference",
                 "sum(X) Absolute Difference Jackknife SE"),
        index=rowindex)
    self.assertTrue(output.equals(correct))
def testRelativeToJackknifeSingleComparisonBaselineSecond(self):
    data = pd.DataFrame({"X": [1, 2, 3, 4, 5, 6], "Y": [0, 0, 0, 1, 1, 1]})
    metric = metrics.Sum("X")
    comparison = comparisons.AbsoluteDifference("Y", 1)
    se_method = standard_errors.Jackknife()
    output = core.Analyze(data).relative_to(comparison).with_standard_errors(
        se_method).calculate(metric).run()
    rowindex = pd.Index([0], name="Y")
    correct = pd.DataFrame(
        np.array([[-9.0, np.sqrt(5 * np.var([12, 11, 10, 5, 4, 3]))]]),
        columns=("sum(X) Absolute Difference",
                 "sum(X) Absolute Difference Jackknife SE"),
        index=rowindex)
    self.assertTrue(output.equals(correct))
def testSplitJackknife(self):
    # Python 3: range() is not a list, so build the array via
    # list(range(...)) before concatenating.
    data = pd.DataFrame({"X": np.array(list(range(11)) + [5] * 10),
                         "Y": np.array([0] * 11 + [1] * 10)})
    metric = metrics.Sum("X")
    se_method = standard_errors.Jackknife()
    output = core.Analyze(data).split_by("Y").with_standard_errors(
        se_method).calculate(metric).run()
    rowindex = pd.Index([0, 1], name="Y")
    correct = pd.DataFrame(
        np.array([[55.0, 10.0], [50.0, 0.0]]),
        columns=("sum(X)", "sum(X) Jackknife SE"),
        index=rowindex)
    self.assertTrue(output.equals(correct))
def test_storage_restore_schema_with_primary_key():
    data = [
        ('a',),
        ('b',),
    ]
    index = pd.Index([1, 2], name='key')
    df = pd.DataFrame(data, columns=('value',), index=index)
    storage = Storage(dataframes={'data': df})
    assert list(storage.read('data')) == [[1, 'a'], [2, 'b']]
    assert storage.describe('data') == {
        'primaryKey': 'key',
        'fields': [
            {'name': 'key', 'type': 'integer', 'constraints': {'required': True}},
            {'name': 'value', 'type': 'string'},
        ]
    }
def test_dataframe_to_tsv_taxonomy_format(self):
    index = pd.Index(['seq1', 'seq2'], name='Feature ID', dtype=object)
    columns = ['Taxon', 'Foo', 'Bar']
    df = pd.DataFrame([['taxon1', '42', 'foo'], ['taxon2', '43', 'bar']],
                      index=index, columns=columns, dtype=object)
    exp = (
        'Feature ID\tTaxon\tFoo\tBar\n'
        'seq1\ttaxon1\t42\tfoo\n'
        'seq2\ttaxon2\t43\tbar\n'
    )

    transformer = self.get_transformer(pd.DataFrame, TSVTaxonomyFormat)
    obs = transformer(df)

    with obs.open() as fh:
        self.assertEqual(fh.read(), exp)
def test_series_to_tsv_taxonomy_format(self):
    index = pd.Index(['emrakul', 'peanut'], name='Feature ID',
                     dtype=object)
    series = pd.Series(['taxon1', 'taxon2'],
                       index=index, name='Taxon', dtype=object)
    exp = (
        'Feature ID\tTaxon\n'
        'emrakul\ttaxon1\n'
        'peanut\ttaxon2\n'
    )

    transformer = self.get_transformer(pd.Series, TSVTaxonomyFormat)
    obs = transformer(series)

    with obs.open() as fh:
        self.assertEqual(fh.read(), exp)
def test_tsv_taxonomy_format_to_metadata(self):
    _, obs = self.transform_format(TSVTaxonomyFormat, qiime2.Metadata,
                                   os.path.join('taxonomy',
                                                '3-column.tsv'))

    index = pd.Index(['seq1', 'seq2'], name='Feature ID', dtype=object)
    exp_df = pd.DataFrame([['k__Foo; p__Bar', '-1.0'],
                           ['k__Foo; p__Baz', '-42.0']], index=index,
                          columns=['Taxon', 'Confidence'], dtype=object)
    exp = qiime2.Metadata(exp_df)

    self.assertEqual(exp, obs)

# In-depth testing of the `_taxonomy_formats_to_dataframe` helper function,
# which does the heavy lifting for the transformers.
def test_3_columns(self):
    index = pd.Index(['seq1', 'seq2'], name='Feature ID', dtype=object)
    exp = pd.DataFrame([['k__Foo; p__Bar', '-1.0'],
                        ['k__Foo; p__Baz', '-42.0']], index=index,
                       columns=['Taxon', 'Confidence'], dtype=object)

    # has_header=None (default)
    obs = _taxonomy_formats_to_dataframe(
        self.get_data_path(os.path.join('taxonomy', '3-column.tsv')))

    assert_frame_equal(obs, exp)

    # has_header=True
    obs = _taxonomy_formats_to_dataframe(
        self.get_data_path(os.path.join('taxonomy', '3-column.tsv')),
        has_header=True)

    assert_frame_equal(obs, exp)