def test_repr(self):
a = pd.Series(pd.Categorical([1, 2, 3, 4]))
exp = u("0 1\n1 2\n2 3\n3 4\n" +
"dtype: category\nCategories (4, int64): [1, 2, 3, 4]")
self.assertEqual(exp, a.__unicode__())
a = pd.Series(pd.Categorical(["a", "b"] * 25))
exp = u("0 a\n1 b\n" + " ..\n" + "48 a\n49 b\n" +
"dtype: category\nCategories (2, object): [a, b]")
with option_context("display.max_rows", 5):
self.assertEqual(exp, repr(a))
levs = list("abcdefghijklmnopqrstuvwxyz")
a = pd.Series(pd.Categorical(
["a", "b"], categories=levs, ordered=True))
exp = u("0 a\n1 b\n" + "dtype: category\n"
"Categories (26, object): [a < b < c < d ... w < x < y < z]")
self.assertEqual(exp, a.__unicode__())
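A quick illustrative sketch of the two display behaviors the assertions above encode (assuming a reasonably recent pandas): long category lists are elided with "..." in the Categories line, and display.max_rows truncates the Series body.

import pandas as pd

s = pd.Series(pd.Categorical(list("abcdefghijklmnopqrstuvwxyz")))
print(repr(s))  # the Categories line elides the middle: [a, b, c, d, ..., w, x, y, z]
with pd.option_context("display.max_rows", 5):
    print(repr(pd.Series(pd.Categorical(["a", "b"] * 25))))  # body shows head and tail only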
Example source code for the Python class Categorical()
Source file: test_categorical.py
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
Author: SignalMedia
def test_categorical_series_repr(self):
s = pd.Series(pd.Categorical([1, 2, 3]))
exp = """0 1
1 2
2 3
dtype: category
Categories (3, int64): [1, 2, 3]"""
self.assertEqual(repr(s), exp)
s = pd.Series(pd.Categorical(np.arange(10)))
exp = """0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
dtype: category
Categories (10, int64): [0, 1, 2, 3, ..., 6, 7, 8, 9]"""
self.assertEqual(repr(s), exp)
def test_categorical_series_repr_ordered(self):
s = pd.Series(pd.Categorical([1, 2, 3], ordered=True))
exp = """0 1
1 2
2 3
dtype: category
Categories (3, int64): [1 < 2 < 3]"""
self.assertEqual(repr(s), exp)
s = pd.Series(pd.Categorical(np.arange(10), ordered=True))
exp = """0 0
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
dtype: category
Categories (10, int64): [0 < 1 < 2 < 3 ... 6 < 7 < 8 < 9]"""
self.assertEqual(repr(s), exp)
def test_categorical_series_repr_period_ordered(self):
idx = pd.period_range('2011-01-01 09:00', freq='H', periods=5)
s = pd.Series(pd.Categorical(idx, ordered=True))
exp = """0 2011-01-01 09:00
1 2011-01-01 10:00
2 2011-01-01 11:00
3 2011-01-01 12:00
4 2011-01-01 13:00
dtype: category
Categories (5, period): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 <
2011-01-01 13:00]"""
self.assertEqual(repr(s), exp)
idx = pd.period_range('2011-01', freq='M', periods=5)
s = pd.Series(pd.Categorical(idx, ordered=True))
exp = """0 2011-01
1 2011-02
2 2011-03
3 2011-04
4 2011-05
dtype: category
Categories (5, period): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]"""
self.assertEqual(repr(s), exp)
def test_mode(self):
s = Series(Categorical([1, 1, 2, 4, 5, 5, 5],
categories=[5, 4, 3, 2, 1], ordered=True))
res = s.mode()
exp = Series(Categorical([5], categories=[
5, 4, 3, 2, 1], ordered=True))
tm.assert_series_equal(res, exp)
s = Series(Categorical([1, 1, 1, 4, 5, 5, 5],
categories=[5, 4, 3, 2, 1], ordered=True))
res = s.mode()
exp = Series(Categorical([5, 1], categories=[
5, 4, 3, 2, 1], ordered=True))
tm.assert_series_equal(res, exp)
s = Series(Categorical([1, 2, 3, 4, 5], categories=[5, 4, 3, 2, 1],
ordered=True))
res = s.mode()
exp = Series(Categorical([], categories=[5, 4, 3, 2, 1], ordered=True))
tm.assert_series_equal(res, exp)
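Note that the empty result in the last case reflects the pandas version this test targets; in later releases Series.mode returns every tied value, so an all-unique categorical yields all of its values. A minimal sketch of the common case, assuming a recent pandas:

import pandas as pd

s = pd.Series(pd.Categorical([1, 1, 2, 4, 5, 5, 5],
                             categories=[5, 4, 3, 2, 1], ordered=True))
print(s.mode())  # 5 occurs three times, so it is the single mode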
def test_slicing(self):
cat = Series(Categorical([1, 2, 3, 4]))
reversed = cat[::-1]
exp = np.array([4, 3, 2, 1])
self.assert_numpy_array_equal(reversed.__array__(), exp)
df = DataFrame({'value': (np.arange(100) + 1).astype('int64')})
df['D'] = pd.cut(df.value, bins=[0, 25, 50, 75, 100])
expected = Series([11, '(0, 25]'], index=['value', 'D'], name=10)
result = df.iloc[10]
tm.assert_series_equal(result, expected)
expected = DataFrame({'value': np.arange(11, 21).astype('int64')},
index=np.arange(10, 20).astype('int64'))
expected['D'] = pd.cut(expected.value, bins=[0, 25, 50, 75, 100])
result = df.iloc[10:20]
tm.assert_frame_equal(result, expected)
expected = Series([9, '(0, 25]'], index=['value', 'D'], name=8)
result = df.loc[8]
tm.assert_series_equal(result, expected)
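The 'D' column above is produced by pd.cut, which bins a numeric column into an ordered categorical of intervals; a minimal standalone sketch:

import numpy as np
import pandas as pd

values = pd.Series(np.arange(100) + 1)
binned = pd.cut(values, bins=[0, 25, 50, 75, 100])
print(binned.dtype)           # category
print(binned.cat.ordered)     # True: interval bins carry an ordering
print(binned.cat.categories)  # IntervalIndex: (0, 25], (25, 50], ...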
def test_append(self):
cat = pd.Categorical(["a", "b"], categories=["a", "b"])
vals = [1, 2]
df = pd.DataFrame({"cats": cat, "vals": vals})
cat2 = pd.Categorical(["a", "b", "a", "b"], categories=["a", "b"])
vals2 = [1, 2, 1, 2]
exp = pd.DataFrame({"cats": cat2,
"vals": vals2}, index=pd.Index([0, 1, 0, 1]))
res = df.append(df)
tm.assert_frame_equal(exp, res)
# Concat should raise if the two categoricals do not have the same
# categories
cat3 = pd.Categorical(["a", "b"], categories=["a", "b", "c"])
vals3 = [1, 2]
df_wrong_categories = pd.DataFrame({"cats": cat3, "vals": vals3})
def f():
df.append(df_wrong_categories)
self.assertRaises(ValueError, f)
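If the goal is to combine categoricals whose categories differ rather than raise, pandas offers union_categoricals, which unions the category sets first. A sketch, assuming a recent pandas (where the function lives in pandas.api.types):

import pandas as pd
from pandas.api.types import union_categoricals

a = pd.Categorical(["a", "b"], categories=["a", "b"])
b = pd.Categorical(["a", "b"], categories=["a", "b", "c"])
combined = union_categoricals([a, b])
print(combined.categories)  # Index(['a', 'b', 'c'], dtype='object')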
def test_pickle_v0_14_1(self):
# the deprecated `name` argument raises a warning (GH 10482)
with tm.assert_produces_warning(UserWarning):
cat = pd.Categorical(values=['a', 'b', 'c'],
categories=['a', 'b', 'c', 'd'],
name='foobar', ordered=False)
pickle_path = os.path.join(tm.get_data_path(),
'categorical_0_14_1.pickle')
# This code was executed once on v0.14.1 to generate the pickle:
#
# cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'],
# name='foobar')
# with open(pickle_path, 'wb') as f: pickle.dump(cat, f)
#
self.assert_categorical_equal(cat, pd.read_pickle(pickle_path))
def test_concat_categorical(self):
# See GH 10177
df1 = pd.DataFrame(
    np.arange(18, dtype='int64').reshape(6, 3),
    columns=["a", "b", "c"])
df2 = pd.DataFrame(
np.arange(14, dtype='int64').reshape(7, 2), columns=["a", "c"])
df2['h'] = pd.Series(pd.Categorical(["one", "one", "two", "one", "two",
"two", "one"]))
df_concat = pd.concat((df1, df2), axis=0).reset_index(drop=True)
df_expected = pd.DataFrame(
{'a': [0, 3, 6, 9, 12, 15, 0, 2, 4, 6, 8, 10, 12],
'b': [1, 4, 7, 10, 13, 16, np.nan, np.nan, np.nan, np.nan, np.nan,
np.nan, np.nan],
'c': [2, 5, 8, 11, 14, 17, 1, 3, 5, 7, 9, 11, 13]})
df_expected['h'] = pd.Series(pd.Categorical(
[None, None, None, None, None, None, "one", "one", "two", "one",
"two", "two", "one"]))
tm.assert_frame_equal(df_expected, df_concat)
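A related rule of thumb, as a sketch assuming a recent pandas: concat keeps the categorical dtype only when every input provides the column with identical categories; otherwise the result falls back to object.

import pandas as pd

s1 = pd.Series(pd.Categorical(["one", "two"]))
s2 = pd.Series(pd.Categorical(["one", "two"]))
print(pd.concat([s1, s2], ignore_index=True).dtype)  # category

s3 = pd.Series(pd.Categorical(["one", "three"]))
print(pd.concat([s1, s3], ignore_index=True).dtype)  # object: category sets differ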
def test_categorical(model_and_func):
formula = 'y ~ 1 + d + x1'
y = np.random.randn(1000)
x1 = np.random.randn(1000)
d = np.random.randint(0, 4, 1000)
d = pd.Categorical(d)
data = pd.DataFrame({'y': y, 'x1': x1, 'd': d})
data['Intercept'] = 1.0
model, func = model_and_func
mod = model.from_formula(formula, data)
res3 = mod.fit()
res2 = func(formula, data).fit()
res = model(data.y, data[['Intercept', 'x1', 'd']], None, None).fit()
assert_allclose(res.rsquared, res2.rsquared)
assert_allclose(res2.rsquared, res3.rsquared)
assert mod.formula == formula
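The formula interface can absorb the categorical regressor because patsy-style handling expands d into indicator columns behind the scenes; roughly the same expansion by hand, as a sketch:

import numpy as np
import pandas as pd

d = pd.Categorical(np.random.randint(0, 4, 1000))
dummies = pd.get_dummies(d, drop_first=True)  # one column per non-reference level
print(dummies.shape)  # (1000, 3): four levels minus the reference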
def test_mixed_input(data):
y = PanelData(data.y)
nt = y.values2d.shape[0]
effects = np.random.randint(0, 5, size=nt)
prim = ['a', 'b', 'c', 'd', 'e']
temp = {'effect.0': pd.Categorical(pd.Series(effects, index=y.index)),
'effect.1': pd.Series(np.random.choice(prim, size=nt), index=y.index)}
effects = pd.DataFrame(temp, index=y.index)
mod = PanelOLS(data.y, data.x, other_effects=effects)
mod.fit()
clusters = np.random.randint(0, y.shape[2] // 2, size=(nt, 2))
temp = {}
prim = list(map(lambda s: ''.join(s), list(product(ascii_lowercase, ascii_lowercase))))
temp['var.cluster.0'] = pd.Series(np.random.choice(prim, size=nt), index=y.index)
temp['var.cluster.1'] = pd.Series(clusters[:, 1], index=y.index)
clusters = pd.DataFrame(temp, index=y.index)
mod.fit(cov_type='clustered', clusters=clusters)
def test_general_demean_oneway(panel):
y = PanelData(panel)
dm1 = y.demean('entity')
g = pd.DataFrame(y.entity_ids, index=y.index)
dm2 = y.general_demean(g)
assert_allclose(dm1.values2d, dm2.values2d)
dm1 = y.demean('time')
g = pd.DataFrame(y.time_ids, index=y.index)
dm2 = y.general_demean(g)
assert_allclose(dm1.values2d, dm2.values2d)
g = pd.DataFrame(np.random.randint(0, 10, g.shape), index=y.index)
dm2 = y.general_demean(g)
g = pd.Categorical(g.iloc[:, 0])
d = pd.get_dummies(g)
dm1 = y.values2d - d @ np.linalg.lstsq(d, y.values2d)[0]
assert_allclose(dm1, dm2.values2d)
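The identity this test leans on: demeaning within groups is the same as taking residuals from a least-squares fit on group-membership dummies. A standalone sketch of the equivalence:

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
groups = rng.randint(0, 5, 200)
y = rng.standard_normal((200, 3))

# Residuals from regression on group dummies fit the group means exactly...
d = pd.get_dummies(pd.Categorical(groups)).to_numpy(dtype=float)
resid = y - d @ np.linalg.lstsq(d, y, rcond=None)[0]
# ...so they coincide with subtracting each group's mean directly.
demeaned = y - pd.DataFrame(y).groupby(groups).transform("mean").to_numpy()
assert np.allclose(resid, demeaned)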
def test_general_demean_twoway(panel):
y = PanelData(panel)
dm1 = y.demean('both')
g = pd.DataFrame(y.entity_ids, index=y.index)
g['column2'] = pd.Series(y.time_ids.squeeze(), index=y.index)
dm2 = y.general_demean(g)
assert_allclose(dm1.values2d, dm2.values2d)
g = pd.DataFrame(np.random.randint(0, 10, g.shape), index=y.index)
dm2 = y.general_demean(g)
g1 = pd.Categorical(g.iloc[:, 0])
d1 = pd.get_dummies(g1)
g2 = pd.Categorical(g.iloc[:, 1])
d2 = pd.get_dummies(g2, drop_first=True)
d = np.c_[d1, d2]
dm1 = y.values2d - d @ np.linalg.lstsq(d, y.values2d)[0]
assert_allclose(dm1 - dm2.values2d, np.zeros_like(dm2.values2d), atol=1e-7)
def test_general_weighted_demean_oneway(panel):
y = PanelData(panel)
weights = pd.DataFrame(
np.random.chisquare(10, (y.dataframe.shape[0], 1)) / 10, index=y.index)
w = PanelData(weights)
dm1 = y.demean('entity', weights=w)
g = PanelData(pd.DataFrame(y.entity_ids, index=y.index))
dm2 = y.general_demean(g, w)
assert_allclose(dm1.values2d, dm2.values2d)
dm1 = y.demean('time', weights=w)
g = PanelData(pd.DataFrame(y.time_ids, index=y.index))
dm2 = y.general_demean(g, w)
assert_allclose(dm1.values2d, dm2.values2d)
g = PanelData(pd.DataFrame(np.random.randint(0, 10, g.dataframe.shape),
index=y.index))
dm2 = y.general_demean(g, w)
g = pd.Categorical(g.dataframe.iloc[:, 0])
d = pd.get_dummies(g)
wd = np.sqrt(w.values2d) * d
wy = np.sqrt(w.values2d) * y.values2d
dm1 = wy - wd @ np.linalg.lstsq(wd, wy)[0]
assert_allclose(dm1, dm2.values2d, atol=1e-14)
def test_general_unit_weighted_demean_twoway(panel):
np.random.seed(12345)
y = PanelData(panel)
weights = pd.DataFrame(
np.random.chisquare(10, (y.dataframe.shape[0], 1)) / 10, index=y.index)
w = PanelData(weights)
dm1 = y.demean('both', weights=w)
g = pd.DataFrame(y.entity_ids, index=y.index)
g['column2'] = pd.Series(y.time_ids.squeeze(), index=y.index)
dm2 = y.general_demean(g, weights=w)
assert_allclose(dm1.values2d - dm2.values2d, np.zeros_like(dm2.values2d),
atol=1e-7)
g = pd.DataFrame(np.random.randint(0, 10, g.shape), index=y.index)
dm2 = y.general_demean(g, weights=w)
g1 = pd.Categorical(g.iloc[:, 0])
d1 = pd.get_dummies(g1)
g2 = pd.Categorical(g.iloc[:, 1])
d2 = pd.get_dummies(g2, drop_first=True)
d = np.c_[d1, d2]
wd = np.sqrt(w.values2d) * d
wy = np.sqrt(w.values2d) * y.values2d
dm1 = wy - wd @ np.linalg.lstsq(wd, wy)[0]
assert_allclose(dm1 - dm2.values2d, np.zeros_like(dm2.values2d), atol=1e-7)
def filtered_table(table,
v_gene_coverage, # at least
j_gene_coverage, # at least
v_gene_evalue, # at most
):
"""
Discard the following rows in the table:
- no J assigned
- stop codon found
- V gene coverage less than v_gene_coverage
- J gene coverage less than j_gene_coverage
- V gene E-value greater than v_gene_evalue
Return the filtered table.
"""
stats = FilteringStatistics()
stats.n = len(table)
# Both V and J must be assigned
# (Note V_gene and J_gene columns use empty strings instead of NA)
filtered = table[(table['V_gene'] != '') & (table['J_gene'] != '')][:]
stats.vjassigned = len(filtered)
filtered['V_gene'] = pd.Categorical(filtered['V_gene'])
# Filter out sequences that have a stop codon
filtered = filtered[filtered.stop == 'no']
stats.stop = len(filtered)
# Filter out sequences with a too low V gene hit E-value
filtered = filtered[filtered.V_evalue <= v_gene_evalue]
stats.v_evalue = len(filtered)
# Filter out sequences with too low V gene coverage
filtered = filtered[filtered.V_covered >= v_gene_coverage]
stats.v_coverage = len(filtered)
# Filter out sequences with too low J gene coverage
filtered = filtered[filtered.J_covered >= j_gene_coverage]
stats.j_coverage = len(filtered)
return filtered, stats
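A hedged usage sketch (the table below is hypothetical and the thresholds illustrative; FilteringStatistics comes from the surrounding module):

import pandas as pd

table = pd.DataFrame({
    "V_gene": ["IGHV1-2", "", "IGHV3-23"],
    "J_gene": ["IGHJ4", "IGHJ6", ""],
    "stop": ["no", "no", "yes"],
    "V_evalue": [1e-30, 1e-5, 1e-40],
    "V_covered": [98.0, 50.0, 99.0],
    "J_covered": [95.0, 40.0, 97.0],
})
filtered, stats = filtered_table(table, v_gene_coverage=90,
                                 j_gene_coverage=60, v_gene_evalue=1e-10)
print(len(filtered))  # only the first row survives all filters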
def generate_agents(df, country, population):
"""
Generate a dataframe of agents for a country where population
is the number of agents to be created.
"""
def max_value(attribute):
return df[attribute].max()
# Turn this on for truly random output from each process.
# pid = mp.current_process()._identity[0]
rand = np.random.mtrand.RandomState(0)
country_data = df[df.index == country].to_dict("records")[0]
gdp = country_data["GDP"]
income_array = gdp / 10 * rand.chisquare(10, population).astype('float32')
unemployment_rate = float(country_data["Unemployment"] / 100.0)
employment_array = rand.choice([True, False], population,
p=[1 - unemployment_rate, unemployment_rate])
attachment_array = (country_data["Fertility"] *
rand.triangular(0.0, 0.5, 1.0, population) /
max_value("Fertility")).astype('float32')
frame = pd.DataFrame({
"Country": pd.Categorical([country] * population, list(df.index)),
"Income": income_array,
"Employed": employment_array.astype('bool'),
"Attachment": attachment_array,
"Location": pd.Categorical([country] * population, list(df.index)),
"Migration": 0,
}, columns=world_columns)
return frame
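A hedged usage sketch (the country table is hypothetical; generate_agents reads the "GDP", "Unemployment" and "Fertility" columns, and the module-level world_columns list fixes the output column order):

import pandas as pd

df = pd.DataFrame({"GDP": [50000.0, 9000.0],
                   "Unemployment": [5.0, 12.0],
                   "Fertility": [1.8, 2.9]},
                  index=["Sweden", "Morocco"])
agents = generate_agents(df, "Sweden", population=1000)
print(agents["Employed"].mean())  # roughly 1 - unemployment rate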
def y_transform(Y, data, flatten):
df_y = data[Y]
# An int flatten value acts as a fixed ">=" threshold, while a
# float is treated as a quantile cutoff (see the cases below).
# These branches cover predictions that are true/false while the
# y-feature is in a different format (e.g. continuous).
if flatten == 'mean':
df_y = pd.DataFrame(df_y >= df_y.mean())
elif flatten == 'median':
df_y = pd.DataFrame(df_y >= df_y.median())
elif flatten == 'mode':
df_y = pd.DataFrame(df_y >= df_y.mode()[0])
elif type(flatten) == int:
df_y = pd.DataFrame(df_y >= flatten)
elif type(flatten) == float:
df_y = pd.DataFrame(df_y >= df_y.quantile(flatten))
# The cases below convert the y-feature to a categorical,
# whether it is numeric or string.
elif flatten == 'cat_string':
df_y = pd.Categorical(df_y)
df_y = pd.DataFrame(pd.Series(df_y).cat.codes)
elif flatten == 'cat_numeric':
df_y = pd.qcut(df_y, 5, duplicates='drop')
df_y = pd.DataFrame(pd.Series(df_y).cat.codes)
# For cases where the y-feature is already in the format
# that the prediction output will use.
elif flatten == 'none':
df_y = pd.DataFrame(df_y)
return df_y
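A hedged usage sketch with hypothetical data, exercising two of the flatten modes:

import pandas as pd

data = pd.DataFrame({"price": [3.0, 7.5, 1.2, 9.9, 5.5]})
binary = y_transform("price", data, flatten="median")   # True where price >= median

labels = pd.DataFrame({"color": ["red", "blue", "red", "green", "blue"]})
codes = y_transform("color", labels, flatten="cat_string")  # integer category codes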
def get_scale(self, gg):
"""
Create a scale
"""
# This method does some introspection to save users from
# a scale mismatch error. This can happen when the
# aesthetic is mapped to a categorical but the limits
# are not provided in categorical form. We only handle
# the case where the mapping uses an expression to
# convert to categorical, e.g. `aes(color='factor(cyl)')`.
# However, if the `'cyl'` column is a categorical and the
# mapping is `aes(color='cyl')`, that will result in
# an error. If the latter case proves common enough, we
# could inspect the data and be clever based on that too.
ae = self.aesthetic
series = pd.Series(self.limits)
ae_values = []
# Look through all the mappings for this aesthetic;
# if we detect any factor()/Categorical() usage, we convert
# the limits data to categorical so that the right scale
# can be chosen. This should take care of the most
# common use cases.
for layer in gg.layers:
with suppress(KeyError):
value = layer.mapping[ae]
if isinstance(value, six.string_types):
ae_values.append(value)
for value in ae_values:
if ('factor(' in value or
'Categorical(' in value):
series = pd.Categorical(series)
break
return make_scale(self.aesthetic,
series,
limits=self.limits,
trans=self.trans)
def test_inverse_transform(self):
de = dpp.DummyEncoder()
df = dd.from_pandas(pd.DataFrame({"A": np.arange(10),
"B": pd.Categorical(['a'] * 4 +
['b'] * 6)}),
npartitions=2)
de.fit(df)
assert_eq_df(df, de.inverse_transform(de.transform(df)))
assert_eq_df(df, de.inverse_transform(de.transform(df).values))
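For context, a sketch of what the round trip exercises (dask_ml's DummyEncoder one-hot encodes the categorical columns of a dask DataFrame; the exact dummy column names may vary by version):

import dask.dataframe as dd
import numpy as np
import pandas as pd
from dask_ml.preprocessing import DummyEncoder

pdf = pd.DataFrame({"A": np.arange(4),
                    "B": pd.Categorical(["a", "a", "b", "b"])})
ddf = dd.from_pandas(pdf, npartitions=2)
de = DummyEncoder().fit(ddf)
onehot = de.transform(ddf)               # 'B' expands to indicator columns, e.g. B_a, B_b
restored = de.inverse_transform(onehot)  # recovers the original categorical column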