python类Categorical()的实例源码

test_categorical.py 文件源码 项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda 作者: SignalMedia 项目源码 文件源码 阅读 34 收藏 0 点赞 0 评论 0
def test_repr(self):
        """Check the text rendering of categorical Series.

        Three cases are compared against literal expected output: a short
        Series, a 50-row Series rendered with ``display.max_rows`` set to 5,
        and an ordered categorical declaring 26 categories.
        """
        # Short series: all rows and all categories appear in the output.
        short = pd.Series(pd.Categorical([1, 2, 3, 4]))
        expected = u("0    1\n1    2\n2    3\n3    4\n" +
                     "dtype: category\nCategories (4, int64): [1, 2, 3, 4]")
        self.assertEqual(expected, short.__unicode__())

        # 50 rows under max_rows=5: the expected text has a ".." row between
        # the first two and the last two rows.
        long_series = pd.Series(pd.Categorical(["a", "b"] * 25))
        expected = u("0     a\n1     b\n" + "     ..\n" + "48    a\n49    b\n" +
                     "dtype: category\nCategories (2, object): [a, b]")
        with option_context("display.max_rows", 5):
            self.assertEqual(expected, repr(long_series))

        # Ordered categorical: categories are joined with "<" and the middle
        # of the 26-letter listing is replaced by "...".
        letters = list("abcdefghijklmnopqrstuvwxyz")
        ordered = pd.Series(
            pd.Categorical(["a", "b"], categories=letters, ordered=True))
        expected = u("0    a\n1    b\n" + "dtype: category\n"
                     "Categories (26, object): [a < b < c < d ... w < x < y < z]")
        self.assertEqual(expected, ordered.__unicode__())
test_categorical.py 文件源码 项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda 作者: SignalMedia 项目源码 文件源码 阅读 32 收藏 0 点赞 0 评论 0
def test_categorical_series_repr(self):
        """Check ``repr`` of Series wrapping unordered integer categoricals."""
        # Three categories: the category listing is printed in full.
        ser = pd.Series(pd.Categorical([1, 2, 3]))
        expected = """0    1
1    2
2    3
dtype: category
Categories (3, int64): [1, 2, 3]"""
        self.assertEqual(repr(ser), expected)

        # Ten categories: the expected listing elides the middle with "...".
        ser = pd.Series(pd.Categorical(np.arange(10)))
        expected = """0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: category
Categories (10, int64): [0, 1, 2, 3, ..., 6, 7, 8, 9]"""
        self.assertEqual(repr(ser), expected)
test_categorical.py 文件源码 项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda 作者: SignalMedia 项目源码 文件源码 阅读 27 收藏 0 点赞 0 评论 0
def test_categorical_series_repr_ordered(self):
        """Check ``repr`` of Series wrapping ordered integer categoricals."""
        # Three ordered categories: the listing uses "<" as the separator.
        ser = pd.Series(pd.Categorical([1, 2, 3], ordered=True))
        expected = """0    1
1    2
2    3
dtype: category
Categories (3, int64): [1 < 2 < 3]"""
        self.assertEqual(repr(ser), expected)

        # Ten ordered categories: the expected listing elides the middle.
        ser = pd.Series(pd.Categorical(np.arange(10), ordered=True))
        expected = """0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
9    9
dtype: category
Categories (10, int64): [0 < 1 < 2 < 3 ... 6 < 7 < 8 < 9]"""
        self.assertEqual(repr(ser), expected)
test_categorical.py 文件源码 项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda 作者: SignalMedia 项目源码 文件源码 阅读 27 收藏 0 点赞 0 评论 0
def test_categorical_series_repr_period_ordered(self):
        """Check ``repr`` of Series wrapping ordered period categoricals."""
        # Hourly periods: the expected category listing wraps onto a second,
        # indented line.
        hourly = pd.period_range('2011-01-01 09:00', freq='H', periods=5)
        ser = pd.Series(pd.Categorical(hourly, ordered=True))
        expected = """0   2011-01-01 09:00
1   2011-01-01 10:00
2   2011-01-01 11:00
3   2011-01-01 12:00
4   2011-01-01 13:00
dtype: category
Categories (5, period): [2011-01-01 09:00 < 2011-01-01 10:00 < 2011-01-01 11:00 < 2011-01-01 12:00 <
                         2011-01-01 13:00]"""
        self.assertEqual(repr(ser), expected)

        # Monthly periods: the listing fits on a single line.
        monthly = pd.period_range('2011-01', freq='M', periods=5)
        ser = pd.Series(pd.Categorical(monthly, ordered=True))
        expected = """0   2011-01
1   2011-02
2   2011-03
3   2011-04
4   2011-05
dtype: category
Categories (5, period): [2011-01 < 2011-02 < 2011-03 < 2011-04 < 2011-05]"""
        self.assertEqual(repr(ser), expected)
test_categorical.py 文件源码 项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda 作者: SignalMedia 项目源码 文件源码 阅读 27 收藏 0 点赞 0 评论 0
def test_mode(self):
        """Check ``Series.mode`` on ordered categoricals.

        Each case pairs the input values with the values expected in the
        result; input and expectation share the same ordered categories.
        """
        cats = [5, 4, 3, 2, 1]
        cases = [
            # a single most-frequent value
            ([1, 1, 2, 4, 5, 5, 5], [5]),
            # two values tied for most frequent
            ([1, 1, 1, 4, 5, 5, 5], [5, 1]),
            # every value occurs once: an empty result is expected
            ([1, 2, 3, 4, 5], []),
        ]
        for values, modes in cases:
            ser = Series(Categorical(values, categories=cats, ordered=True))
            expected = Series(Categorical(modes, categories=cats,
                                          ordered=True))
            tm.assert_series_equal(ser.mode(), expected)
test_categorical.py 文件源码 项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda 作者: SignalMedia 项目源码 文件源码 阅读 27 收藏 0 点赞 0 评论 0
def test_slicing(self):
        """Check slicing and row selection on categorical data.

        Covers reversing a categorical Series and selecting rows from a
        DataFrame whose ``D`` column is built by ``pd.cut``.
        """
        # Reversing with a negative step yields the values in reverse order.
        ser = Series(Categorical([1, 2, 3, 4]))
        flipped = ser[::-1]
        self.assert_numpy_array_equal(flipped.__array__(),
                                      np.array([4, 3, 2, 1]))

        bins = [0, 25, 50, 75, 100]
        df = DataFrame({'value': (np.arange(100) + 1).astype('int64')})
        df['D'] = pd.cut(df.value, bins=bins)

        # A single positional row comes back as a Series named after its
        # index label.
        expected = Series([11, '(0, 25]'], index=['value', 'D'], name=10)
        tm.assert_series_equal(df.iloc[10], expected)

        # A positional slice keeps the original index labels 10..19.
        expected = DataFrame({'value': np.arange(11, 21).astype('int64')},
                             index=np.arange(10, 20).astype('int64'))
        expected['D'] = pd.cut(expected.value, bins=bins)
        tm.assert_frame_equal(df.iloc[10:20], expected)

        # Label-based selection of a single row.
        expected = Series([9, '(0, 25]'], index=['value', 'D'], name=8)
        tm.assert_series_equal(df.loc[8], expected)
test_categorical.py 文件源码 项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda 作者: SignalMedia 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def test_append(self):
        """Check ``DataFrame.append`` with categorical columns.

        Appending a frame to itself must give the doubled frame; appending a
        frame whose categorical has different categories must raise
        ``ValueError``.
        """
        base = pd.DataFrame({
            "cats": pd.Categorical(["a", "b"], categories=["a", "b"]),
            "vals": [1, 2],
        })
        doubled = pd.DataFrame(
            {"cats": pd.Categorical(["a", "b", "a", "b"],
                                    categories=["a", "b"]),
             "vals": [1, 2, 1, 2]},
            index=pd.Index([0, 1, 0, 1]))

        tm.assert_frame_equal(doubled, base.append(base))

        # The second frame declares an extra category "c", so the two
        # categoricals do not have the same categories.
        mismatched = pd.DataFrame({
            "cats": pd.Categorical(["a", "b"], categories=["a", "b", "c"]),
            "vals": [1, 2],
        })
        self.assertRaises(ValueError, base.append, mismatched)
test_categorical.py 文件源码 项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda 作者: SignalMedia 项目源码 文件源码 阅读 27 收藏 0 点赞 0 评论 0
def test_pickle_v0_14_1(self):
        """Check that a Categorical pickled on v0.14.1 still loads equal."""
        # Passing ``name`` is expected to emit a UserWarning (issue 10482).
        with tm.assert_produces_warning(UserWarning):
            expected = pd.Categorical(values=['a', 'b', 'c'],
                                      categories=['a', 'b', 'c', 'd'],
                                      name='foobar', ordered=False)

        # The pickle file was generated once on v0.14.1 with:
        #
        # cat = Categorical(labels=np.arange(3), levels=['a', 'b', 'c', 'd'],
        #                   name='foobar')
        # with open(pickle_path, 'wb') as f: pickle.dump(cat, f)
        #
        data_dir = tm.get_data_path()
        pickle_path = os.path.join(data_dir, 'categorical_0_14_1.pickle')
        loaded = pd.read_pickle(pickle_path)
        self.assert_categorical_equal(expected, loaded)
test_categorical.py 文件源码 项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda 作者: SignalMedia 项目源码 文件源码 阅读 31 收藏 0 点赞 0 评论 0
def test_concat_categorical(self):
        """Check row-wise concat when only one frame has a categorical column.

        See GH 10177. The first frame has columns a/b/c; the second has a/c
        plus a categorical column ``h``.
        """
        first = pd.DataFrame(np.arange(18, dtype='int64').reshape(6, 3),
                             columns=["a", "b", "c"])

        second = pd.DataFrame(np.arange(14, dtype='int64').reshape(7, 2),
                              columns=["a", "c"])
        labels = ["one", "one", "two", "one", "two", "two", "one"]
        second['h'] = pd.Series(pd.Categorical(labels))

        combined = pd.concat((first, second), axis=0)
        combined = combined.reset_index(drop=True)

        # Rows from the frame lacking a column are expected to hold missing
        # values there: NaN in ``b`` for the second frame, None in ``h`` for
        # the first.
        expected = pd.DataFrame(
            {'a': [0, 3, 6, 9, 12, 15, 0, 2, 4, 6, 8, 10, 12],
             'b': [1, 4, 7, 10, 13, 16] + [np.nan] * 7,
             'c': [2, 5, 8, 11, 14, 17, 1, 3, 5, 7, 9, 11, 13]})
        expected['h'] = pd.Series(pd.Categorical([None] * 6 + labels))

        tm.assert_frame_equal(expected, combined)
test_formulas.py 文件源码 项目:linearmodels 作者: bashtage 项目源码 文件源码 阅读 18 收藏 0 点赞 0 评论 0
def test_categorical(model_and_func):
    """Fit one specification three ways and compare the R-squared values.

    The model is built via ``from_formula``, via the formula helper function,
    and via direct construction from data columns. The formula stored on the
    formula-built model must equal the input formula.
    """
    formula = 'y ~ 1 + d + x1'
    nobs = 1000
    # Dict values are evaluated in order, so the random draws happen as
    # y, x1, d.
    data = pd.DataFrame({
        'y': np.random.randn(nobs),
        'x1': np.random.randn(nobs),
        'd': pd.Categorical(np.random.randint(0, 4, nobs)),
    })
    data['Intercept'] = 1.0
    model, func = model_and_func

    from_formula_model = model.from_formula(formula, data)
    fit_from_formula = from_formula_model.fit()
    fit_from_func = func(formula, data).fit()
    exog = data[['Intercept', 'x1', 'd']]
    fit_direct = model(data.y, exog, None, None).fit()

    assert_allclose(fit_direct.rsquared, fit_from_func.rsquared)
    assert_allclose(fit_from_func.rsquared, fit_from_formula.rsquared)
    assert from_formula_model.formula == formula
test_cluster_input_formats.py 文件源码 项目:linearmodels 作者: bashtage 项目源码 文件源码 阅读 28 收藏 0 点赞 0 评论 0
def test_mixed_input(data):
    """Smoke-test PanelOLS with effects and clusters of mixed column types.

    The effects frame combines a Categorical column with a string column;
    the clusters frame combines a string column with an integer column.
    Only successful fitting is exercised; no values are asserted.
    """
    y = PanelData(data.y)
    nobs = y.values2d.shape[0]

    # Effects: integer codes wrapped in a Categorical, then single letters.
    codes = np.random.randint(0, 5, size=nobs)
    letters = ['a', 'b', 'c', 'd', 'e']
    effect_cols = {
        'effect.0': pd.Categorical(pd.Series(codes, index=y.index)),
        'effect.1': pd.Series(np.random.choice(letters, size=nobs),
                              index=y.index),
    }
    effects = pd.DataFrame(effect_cols, index=y.index)
    mod = PanelOLS(data.y, data.x, other_effects=effects)
    mod.fit()

    # Clusters: two-letter strings in one column, the second column of the
    # integer draws in the other.
    int_clusters = np.random.randint(0, y.shape[2] // 2, size=(nobs, 2))
    letter_pairs = [''.join(pair)
                    for pair in product(ascii_lowercase, ascii_lowercase)]
    cluster_cols = {}
    cluster_cols['var.cluster.0'] = pd.Series(
        np.random.choice(letter_pairs, size=nobs), index=y.index)
    cluster_cols['var.cluster.1'] = pd.Series(int_clusters[:, 1],
                                              index=y.index)
    clusters = pd.DataFrame(cluster_cols, index=y.index)
    mod.fit(cov_type='clustered', clusters=clusters)
test_data.py 文件源码 项目:linearmodels 作者: bashtage 项目源码 文件源码 阅读 40 收藏 0 点赞 0 评论 0
def test_general_demean_oneway(panel):
    """Compare ``general_demean`` with one group column against references.

    Entity and time groupings are checked against ``demean``; a random
    grouping is checked against least-squares residuals on group dummies.
    """
    y = PanelData(panel)

    # Grouping by entity ids must match demean('entity').
    reference = y.demean('entity')
    groups = pd.DataFrame(y.entity_ids, index=y.index)
    general = y.general_demean(groups)
    assert_allclose(reference.values2d, general.values2d)

    # Grouping by time ids must match demean('time').
    reference = y.demean('time')
    groups = pd.DataFrame(y.time_ids, index=y.index)
    general = y.general_demean(groups)
    assert_allclose(reference.values2d, general.values2d)

    # Random groups: the reference is y minus its projection on the dummies.
    groups = pd.DataFrame(np.random.randint(0, 10, groups.shape),
                          index=y.index)
    general = y.general_demean(groups)
    dummies = pd.get_dummies(pd.Categorical(groups.iloc[:, 0]))
    coefs = np.linalg.lstsq(dummies, y.values2d)[0]
    residuals = y.values2d - dummies @ coefs
    assert_allclose(residuals, general.values2d)
test_data.py 文件源码 项目:linearmodels 作者: bashtage 项目源码 文件源码 阅读 35 收藏 0 点赞 0 评论 0
def test_general_demean_twoway(panel):
    """Compare ``general_demean`` with two group columns against references.

    Entity plus time grouping is checked against ``demean('both')``; a random
    two-column grouping is checked against least-squares residuals.
    """
    y = PanelData(panel)

    # Entity ids and time ids together must match demean('both').
    reference = y.demean('both')
    groups = pd.DataFrame(y.entity_ids, index=y.index)
    groups['column2'] = pd.Series(y.time_ids.squeeze(), index=y.index)
    general = y.general_demean(groups)
    assert_allclose(reference.values2d, general.values2d)

    # Random groups: regress on the dummies of both columns. The second
    # dummy set is built with drop_first=True.
    groups = pd.DataFrame(np.random.randint(0, 10, groups.shape),
                          index=y.index)
    general = y.general_demean(groups)
    first_dummies = pd.get_dummies(pd.Categorical(groups.iloc[:, 0]))
    second_dummies = pd.get_dummies(pd.Categorical(groups.iloc[:, 1]),
                                    drop_first=True)
    design = np.c_[first_dummies, second_dummies]
    coefs = np.linalg.lstsq(design, y.values2d)[0]
    residuals = y.values2d - design @ coefs
    assert_allclose(residuals - general.values2d,
                    np.zeros_like(general.values2d), atol=1e-7)
test_data.py 文件源码 项目:linearmodels 作者: bashtage 项目源码 文件源码 阅读 46 收藏 0 点赞 0 评论 0
def test_general_weighted_demean_oneway(panel):
    """Compare weighted ``general_demean`` with one group column to references.

    Entity and time groupings are checked against weighted ``demean``; a
    random grouping is checked against least-squares residuals computed on
    data scaled by the square root of the weights.
    """
    y = PanelData(panel)
    nobs = y.dataframe.shape[0]
    raw_weights = np.random.chisquare(10, (nobs, 1)) / 10
    w = PanelData(pd.DataFrame(raw_weights, index=y.index))

    # Grouping by entity ids must match the weighted entity demean.
    reference = y.demean('entity', weights=w)
    groups = PanelData(pd.DataFrame(y.entity_ids, index=y.index))
    general = y.general_demean(groups, w)
    assert_allclose(reference.values2d, general.values2d)

    # Grouping by time ids must match the weighted time demean.
    reference = y.demean('time', weights=w)
    groups = PanelData(pd.DataFrame(y.time_ids, index=y.index))
    general = y.general_demean(groups, w)
    assert_allclose(reference.values2d, general.values2d)

    # Random groups: scale both the dummies and y by sqrt(weights), then
    # take the least-squares residuals as the reference.
    random_ids = np.random.randint(0, 10, groups.dataframe.shape)
    groups = PanelData(pd.DataFrame(random_ids, index=y.index))
    general = y.general_demean(groups, w)
    dummies = pd.get_dummies(pd.Categorical(groups.dataframe.iloc[:, 0]))
    weighted_dummies = np.sqrt(w.values2d) * dummies
    weighted_y = np.sqrt(w.values2d) * y.values2d
    coefs = np.linalg.lstsq(weighted_dummies, weighted_y)[0]
    residuals = weighted_y - weighted_dummies @ coefs
    assert_allclose(residuals, general.values2d, atol=1e-14)
test_data.py 文件源码 项目:linearmodels 作者: bashtage 项目源码 文件源码 阅读 32 收藏 0 点赞 0 评论 0
def test_general_unit_weighted_demean_twoway(panel):
    """Compare weighted ``general_demean`` with two group columns to references.

    Entity plus time grouping is checked against weighted ``demean('both')``;
    a random two-column grouping is checked against least-squares residuals
    on data scaled by the square root of the weights. The global NumPy seed
    is fixed first.
    """
    np.random.seed(12345)
    y = PanelData(panel)
    nobs = y.dataframe.shape[0]
    raw_weights = np.random.chisquare(10, (nobs, 1)) / 10
    w = PanelData(pd.DataFrame(raw_weights, index=y.index))

    # Entity ids and time ids together must match the weighted demean('both').
    reference = y.demean('both', weights=w)
    groups = pd.DataFrame(y.entity_ids, index=y.index)
    groups['column2'] = pd.Series(y.time_ids.squeeze(), index=y.index)
    general = y.general_demean(groups, weights=w)
    assert_allclose(reference.values2d - general.values2d,
                    np.zeros_like(general.values2d), atol=1e-7)

    # Random groups: weighted regression on the dummies of both columns. The
    # second dummy set is built with drop_first=True.
    groups = pd.DataFrame(np.random.randint(0, 10, groups.shape),
                          index=y.index)
    general = y.general_demean(groups, weights=w)
    first_dummies = pd.get_dummies(pd.Categorical(groups.iloc[:, 0]))
    second_dummies = pd.get_dummies(pd.Categorical(groups.iloc[:, 1]),
                                    drop_first=True)
    design = np.c_[first_dummies, second_dummies]
    weighted_design = np.sqrt(w.values2d) * design
    weighted_y = np.sqrt(w.values2d) * y.values2d
    coefs = np.linalg.lstsq(weighted_design, weighted_y)[0]
    residuals = weighted_y - weighted_design @ coefs
    assert_allclose(residuals - general.values2d,
                    np.zeros_like(general.values2d), atol=1e-7)
filter.py 文件源码 项目:IgDiscover 作者: NBISweden 项目源码 文件源码 阅读 28 收藏 0 点赞 0 评论 0
def filtered_table(table,
        v_gene_coverage,  # at least
        j_gene_coverage,  # at least
        v_gene_evalue,  # at most
    ):
    """
    Discard the following rows in the table:
    - no V or no J assigned (empty string in V_gene or J_gene)
    - stop codon found (stop column is not 'no')
    - V gene E-value greater than v_gene_evalue
    - V gene coverage less than v_gene_coverage
    - J gene coverage less than j_gene_coverage

    The filters are applied in the order listed, and the number of rows
    remaining after each step is recorded on a FilteringStatistics object
    (attributes n, vjassigned, stop, v_evalue, v_coverage, j_coverage).

    Return a tuple (filtered table, statistics).
    """
    stats = FilteringStatistics()
    stats.n = len(table)

    # Both V and J must be assigned
    # (Note V_gene and J_gene columns use empty strings instead of NA)
    assigned = (table['V_gene'] != '') & (table['J_gene'] != '')
    # Take an explicit copy: slicing with [:] is not a copy, and the column
    # assignment below must write into a frame that is independent of
    # `table`.
    filtered = table[assigned].copy()
    stats.vjassigned = len(filtered)
    # Converted before the remaining filters run, so the conversion sees
    # every V gene among the rows that have both V and J assigned.
    filtered['V_gene'] = pd.Categorical(filtered['V_gene'])

    # Filter out sequences that have a stop codon
    filtered = filtered[filtered.stop == 'no']
    stats.stop = len(filtered)

    # Filter out sequences whose V gene hit E-value is too high
    filtered = filtered[filtered.V_evalue <= v_gene_evalue]
    stats.v_evalue = len(filtered)

    # Filter out sequences with too low V gene coverage
    filtered = filtered[filtered.V_covered >= v_gene_coverage]
    stats.v_coverage = len(filtered)

    # Filter out sequences with too low J gene coverage
    filtered = filtered[filtered.J_covered >= j_gene_coverage]
    stats.j_coverage = len(filtered)

    return filtered, stats
migration.py 文件源码 项目:GOS 作者: crcresearch 项目源码 文件源码 阅读 51 收藏 0 点赞 0 评论 0
def generate_agents(df, country, population):
    """
    Build a dataframe of `population` agents for `country`.

    `df` is indexed by country; its row for `country` supplies the "GDP",
    "Unemployment" and "Fertility" values used to draw each agent's income,
    employment status and attachment. The result has the columns listed in
    `world_columns`.
    """
    # The generator is seeded with a constant, so each call draws the same
    # random sequence.
    rng = np.random.mtrand.RandomState(0)
    record = df[df.index == country].to_dict("records")[0]

    # Income: GDP / 10 times a chi-square(10) draw.
    income = record["GDP"] / 10 * rng.chisquare(10, population).astype('float32')

    # Employment: True with probability 1 - unemployment rate.
    jobless = float(record["Unemployment"] / 100.0)
    employed = rng.choice([True, False], population,
                          p=[1 - jobless, jobless])

    # Attachment: country fertility times a triangular(0, 0.5, 1) draw,
    # divided by the largest fertility value in df.
    fertility_draws = record["Fertility"] * rng.triangular(0.0, 0.5, 1.0,
                                                           population)
    attachment = (fertility_draws / df["Fertility"].max()).astype('float32')

    def country_column():
        # A fresh categorical per column; the categories are all countries
        # in df.
        return pd.Categorical([country] * population, list(df.index))

    return pd.DataFrame({
        "Country": country_column(),
        "Income": income,
        "Employed": employed.astype('bool'),
        "Attachment": attachment,
        "Location": country_column(),
        "Migration": 0,
    }, columns=world_columns)
y_transform.py 文件源码 项目:autonomio 作者: autonomio 项目源码 文件源码 阅读 29 收藏 0 点赞 0 评论 0
def y_transform(Y, data, flatten):
    """Convert the target column ``data[Y]`` into the form used for prediction.

    Parameters
    ----------
    Y : column label
        Label of the target column in ``data``.
    data : pandas.DataFrame
        Frame holding the target column.
    flatten : str, int or float
        How to transform the column:

        * ``'mean'`` / ``'median'`` / ``'mode'`` -- boolean frame, True where
          the value is >= the column mean / median / first mode.
        * an integer (Python or NumPy) -- boolean frame, True where the value
          is >= ``flatten``.
        * a float (Python or NumPy) -- boolean frame, True where the value is
          >= the ``flatten`` quantile of the column.
        * ``'cat_string'`` -- integer category codes of the column.
        * ``'cat_numeric'`` -- integer codes of up to five quantile bins
          (``pd.qcut`` with duplicate edges dropped).
        * ``'none'`` -- the column wrapped in a DataFrame, unchanged.

    Returns
    -------
    pandas.DataFrame
        The transformed target. For any other ``flatten`` value (including
        ``bool``) the original Series is returned untouched.
    """
    # Function-scope import keeps this block self-contained.
    import numbers

    df_y = data[Y]

    # Numeric thresholds. ABC checks are used so that NumPy scalars
    # (np.int64, np.float64, ...) work like Python numbers instead of being
    # silently ignored. bool is an int subclass but was never treated as a
    # threshold, so it is excluded explicitly.
    if isinstance(flatten, numbers.Real) and not isinstance(flatten, bool):
        if isinstance(flatten, numbers.Integral):
            # integer: "greater than or equal to this value"
            threshold = flatten
        else:
            # float: interpreted as a quantile of the column
            threshold = df_y.quantile(flatten)
        return pd.DataFrame(df_y >= threshold)

    # Cases where the prediction is true/false but the y-feature is in a
    # different format (e.g. continuous).
    if flatten == 'mean':
        return pd.DataFrame(df_y >= df_y.mean())
    if flatten == 'median':
        return pd.DataFrame(df_y >= df_y.median())
    if flatten == 'mode':
        return pd.DataFrame(df_y >= df_y.mode()[0])

    # Cases where the y-feature is converted to categorical codes, whether
    # it is a string or a number.
    if flatten == 'cat_string':
        codes = pd.Series(pd.Categorical(df_y)).cat.codes
        return pd.DataFrame(codes)
    if flatten == 'cat_numeric':
        binned = pd.qcut(df_y, 5, duplicates='drop')
        return pd.DataFrame(pd.Series(binned).cat.codes)

    # The y-feature is already in the format of the prediction output.
    if flatten == 'none':
        return pd.DataFrame(df_y)

    # Unrecognised option: hand back the column as-is.
    return df_y
limits.py 文件源码 项目:plotnine 作者: has2k1 项目源码 文件源码 阅读 28 收藏 0 点赞 0 评论 0
def get_scale(self, gg):
        """
        Create a scale for this aesthetic from the stored limits.

        If any layer of `gg` maps the aesthetic to a string expression
        containing ``factor(`` or ``Categorical(``, the limits are converted
        to a categorical before the scale is made.
        """
        # The introspection below saves users from a scale mismatch error,
        # which can happen when the aesthetic is mapped to a categorical
        # but the limits are not given in categorical form. Only mappings
        # that convert with an expression, e.g. `aes(color='factor(cyl)')`,
        # are detected. A mapping such as `aes(color='cyl')` is not, because
        # the data itself is never inspected here.
        ae = self.aesthetic
        series = pd.Series(self.limits)

        # Gather the string mappings for this aesthetic across all layers;
        # layers that do not map it are skipped.
        expressions = []
        for layer in gg.layers:
            with suppress(KeyError):
                mapped = layer.mapping[ae]
                if isinstance(mapped, six.string_types):
                    expressions.append(mapped)

        # One factor-like expression is enough to switch to a categorical.
        uses_factor = any(
            'factor(' in expr or 'Categorical(' in expr
            for expr in expressions)
        if uses_factor:
            series = pd.Categorical(series)

        return make_scale(self.aesthetic,
                          series,
                          limits=self.limits,
                          trans=self.trans)
test_data.py 文件源码 项目:dask-ml 作者: dask 项目源码 文件源码 阅读 30 收藏 0 点赞 0 评论 0
def test_inverse_transform(self):
        """Check that ``DummyEncoder.inverse_transform`` undoes ``transform``.

        The round trip is checked twice: once from the transformed
        collection itself and once from its ``.values``.
        """
        encoder = dpp.DummyEncoder()
        frame = pd.DataFrame({
            "A": np.arange(10),
            "B": pd.Categorical(['a'] * 4 + ['b'] * 6),
        })
        ddf = dd.from_pandas(frame, npartitions=2)
        encoder.fit(ddf)

        from_frame = encoder.inverse_transform(encoder.transform(ddf))
        assert_eq_df(ddf, from_frame)

        from_values = encoder.inverse_transform(encoder.transform(ddf).values)
        assert_eq_df(ddf, from_values)


问题


面经


文章

微信
公众号

扫码关注公众号