Example source code using the Python Series() class

history_container.py (project: zipline-chinese, author: zhanghan1990)
def aggregate_ohlcv_panel(self,
                              fields,
                              ohlcv_panel,
                              items=None,
                              minor_axis=None):
        """
        Convert an OHLCV Panel into a DataFrame by aggregating each field's
        frame into a Series.
        """
        vals = ohlcv_panel
        if isinstance(ohlcv_panel, pd.Panel):
            vals = ohlcv_panel.values
            items = ohlcv_panel.items
            minor_axis = ohlcv_panel.minor_axis

        data = [
            self.frame_to_series(
                field,
                vals[items.get_loc(field)],
                minor_axis
            )
            for field in fields
        ]
        return np.array(data)
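
A minimal sketch of the indexing pattern above, with hypothetical stand-ins for the panel pieces (pd.Panel itself was removed in pandas 1.0, but the values/items/minor_axis triple can be emulated with a 3-D array and an index):

import numpy as np
import pandas as pd

vals = np.random.randn(2, 3, 4)        # (field, date, sid) value block
items = pd.Index(['open', 'close'])    # field axis of the panel

# items.get_loc maps a field name to its position along axis 0, which is
# how aggregate_ohlcv_panel picks out each field's 2-D frame
close_frame = vals[items.get_loc('close')]   # shape (3, 4)
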
outliers.py (project: py-hadoop-tutorial, author: hougs)
def to_series(tuples):
    """Transforms a list of tuples of the form (date, count) in to a pandas
    series indexed by dt.
    """
    cleaned_time_val_tuples = [tuple for tuple in tuples if not (
        tuple[0] is pd.NaT or tuple[1] is None)]
    if len(cleaned_time_val_tuples) > 0:
        # change list of tuples ie [(a1, b1), (a2, b2), ...] into
        # tuple of lists ie ([a1, a2, ...], [b1, b2, ...])
        unzipped_cleaned_time_values = zip(*cleaned_time_val_tuples)
        # just being explicit about what these are
        counts = unzipped_cleaned_time_values[1]
        timestamps = unzipped_cleaned_time_values[0]
        # Create the series with a sorted index.
        ret_val = pd.Series(counts, index=timestamps).sort_index()
    else:
        ret_val = None
    return ret_val
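
A quick usage sketch: pairs stamped pd.NaT or valued None are dropped before the Series is built, and the surviving index is sorted:

import pandas as pd

tuples = [(pd.Timestamp('2017-01-02'), 5),
          (pd.Timestamp('2017-01-01'), 3),
          (pd.NaT, 7),
          (pd.Timestamp('2017-01-03'), None)]
counts = to_series(tuples)
# counts is indexed 2017-01-01, 2017-01-02 with values 3, 5
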


data_container.py (project: xpandas, author: alan-turing-institute)
def __init__(self, *args, **kwargs):
        '''
        Accepts the same arguments as pandas.Series:
        https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.html

        In order to create an XSeries of any data type, the data argument
        must be a Python list. For example, to create an XSeries of
        pandas.Series objects, pass data = [s_1, s_2, ..., s_n] where each
        s_i is an instance of pandas.Series.
        '''
        super(XSeries, self).__init__(*args, **kwargs)

        data = kwargs.get('data')
        if data is None:
            data = args[0]

        check_result, data_type = _check_all_elements_have_the_same_property(data, type)
        if not check_result:
            raise ValueError('Not all elements are of the same type')

        if data_type is not None:
            self._data_type = data_type
        else:
            self._data_type = type(data._values[0])
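
A minimal construction sketch, assuming XSeries is importable from the xpandas package (the data_container module shown above):

import pandas as pd
from xpandas.data_container import XSeries

# each element is itself a pandas.Series, so the inferred data_type
# of the container is pd.Series
s = XSeries([pd.Series([1, 2]), pd.Series([3, 4])])
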
bag_of_features_transformer.py (project: xpandas, author: alan-turing-institute)
def __init__(self, dictionary=None, **kwargs):
        '''
        :param dictionary: custom dictionary to count against; if None,
                           the dictionary is computed from the dataset
        '''
        self.dictionary = dictionary

        accepted_types = [
            pd.Series, list, np.ndarray, tuple  # np.ndarray, not the np.array factory
        ]

        def bag_of_words_transform_function(corpus):
            counter = Counter(corpus)
            for el in self.dictionary:
                if counter.get(el) is None:
                    counter[el] = 0
            return counter

        super(BagOfWordsTransformer, self).__init__(data_types=accepted_types,
                                                    columns=None,
                                                    transform_function=bag_of_words_transform_function)
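
The transform closure just counts tokens and zero-fills anything from the dictionary that is absent; in isolation the logic looks like this:

from collections import Counter

dictionary = ['a', 'b', 'c']
counter = Counter(['a', 'a', 'b'])
for el in dictionary:
    if counter.get(el) is None:
        counter[el] = 0
# counter -> Counter({'a': 2, 'b': 1, 'c': 0})
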
test_transformer.py (project: xpandas, author: alan-turing-institute)
def test_mean_transformer():
    s1 = XSeries([
        pd.Series(np.random.normal(size=10)),
        pd.Series(np.random.normal(size=15))
    ])
    s2 = XSeries([
        pd.Series(np.random.normal(size=10)),
        pd.Series(np.random.normal(size=15)),
        pd.Series(np.random.normal(size=100))
    ])

    tr = MeanSeriesTransformer()
    tr = tr.fit(s1)

    transformed_s = tr.transform(s2)

    assert transformed_s.shape[0] == 3
    assert type(transformed_s) == XSeries
test_transformer.py (project: xpandas, author: alan-turing-institute)
def test_mean_transformer_data_frame():
    s1 = XSeries([
        pd.Series(np.random.normal(size=10)),
        pd.Series(np.random.normal(size=15))
    ])
    s2 = XSeries([
        pd.Series(np.random.normal(size=10)),
        pd.Series(np.random.normal(size=15))
    ])

    df = XDataFrame({
        's1': s1,
        's2': s2
    })

    tr = MeanSeriesTransformer()
    # record the outcome instead of asserting inside the try block:
    # a bare except would also swallow the AssertionError itself
    raised = False
    try:
        tr = tr.fit(df)
    except Exception:
        raised = True
    assert raised
test_data_type.py (project: xpandas, author: alan-turing-institute)
def test_dataframe_data_types():
    s1 = XSeries([pd.Series([1, 2, 3], index=['a', 'b', 'c']),
                  pd.Series([4, 5, 6], index=['d', 'e', 'g'])])
    s2 = XSeries([1, 2, 3])
    s3 = XSeries([{"k1": "v1"}, {"k2": 'v2'}])
    s4 = XSeries(['f', 's', 't'])

    df = XDataFrame({
        'first_col': s1,
        'second_col': s2,
        'third_col': s3,
        'fourth_col': s4
    })

    assert df['first_col'].data_type == pd.Series
    assert df['second_col'].data_type == np.int64
    assert df['third_col'].data_type == dict
    assert df['fourth_col'].data_type == str

    assert type(df[['first_col']]) == XDataFrame
    assert type(df[['first_col', 'second_col']]) == XDataFrame
test_data_type.py (project: xpandas, author: alan-turing-institute)
def test_dataframe_sub_frame_data_types():
    s1 = XSeries([pd.Series([1, 2, 3], index=['a', 'b', 'c']),
                  pd.Series([4, 5, 6], index=['d', 'e', 'g'])])
    s2 = XSeries([1, 2, 3])
    s3 = XSeries([{"k1": "v1"}, {"k2": 'v2'}])
    s4 = XSeries(['f', 's', 't'])

    df = XDataFrame({
        'first_col': s1,
        'second_col': s2,
        'third_col': s3,
        'fourth_col': s4
    })

    sub_df = df.loc[:2]

    assert type(sub_df) == XDataFrame
    assert sub_df['first_col'].data_type == pd.Series
    assert sub_df['second_col'].data_type == np.int64
    assert sub_df['third_col'].data_type == dict
    assert sub_df['fourth_col'].data_type == str

    assert type(sub_df[['first_col']]) == XDataFrame
    assert type(sub_df[['first_col', 'second_col']]) == XDataFrame
test_data_type.py (project: xpandas, author: alan-turing-institute)
def test_series_replace_element():
    s = XSeries([
        pd.Series([1, 2, 3], index=['a', 'b', 'c']),
        pd.Series([4, 5, 6], index=['d', 'e', 'g'])
    ], name='MySuperSeries')

    # replacing an element with a value of a different type must fail
    raised = False
    try:
        s[0] = 111
    except Exception:
        raised = True
    assert raised

    # replacing an element with another pd.Series is allowed
    s[0] = pd.Series(np.random.normal(size=100))
test_dataframe_transformer.py (project: xpandas, author: alan-turing-institute)
def test_naming():
    X = XSeries([
        pd.Series(np.random.normal(0, 1, 100), name='X')
    ])
    df = XDataFrame({
        'X': X
    })

    dataframe_transformer = XDataFrameTransformer({
        'X': [TimeSeriesTransformer()]
    })

    dataframe_transformer.fit(df)
    transformed_df = dataframe_transformer.transform(df)

    for col_name in transformed_df.columns:
        assert col_name.startswith('X_TimeSeriesTransformer')
test_dataframe_transformer.py (project: xpandas, author: alan-turing-institute)
def test_multiple_transformers_for_one_column():
    X = XSeries([
        pd.Series(np.random.normal(0, 1, 100), name='X')
    ])
    df = XDataFrame({
        'X': X
    })

    dataframe_transformer = XDataFrameTransformer({
        'X': [TimeSeriesTransformer(), IdentityTransformer(), MeanSeriesTransformer()]
    })

    dataframe_transformer.fit(df)
    transformed_df = dataframe_transformer.transform(df)

    for col_name in transformed_df.columns:
        assert col_name.startswith('X_TimeSeriesTransformer') or \
               col_name.startswith('X_IdentityTransformer') or \
               col_name.startswith('X_MeanSeriesTransformer')
test_ts_fresh.py (project: xpandas, author: alan-turing-institute)
def test_ts_fresh_chain():
    s1 = XSeries([
        pd.Series(np.random.normal(0, 1, 20))
        for _ in range(10)
    ], name='X')

    pipe = PipeLineChain([
        ('mean shift', TimeSeriesWindowTransformer()),
        ('ts fresh step', TsFreshSeriesTransformer())
    ])

    pipe.fit(s1)
    transformed_df = pipe.transform(s1)

    # print(transformed_df.head())

    assert type(transformed_df) == XDataFrame
test_munge.py (project: zipline-chinese, author: zhanghan1990)
def test_bfill(self):
        # test ndim=1
        N = 100
        s = pd.Series(np.random.randn(N))
        mask = random.sample(range(N), 10)
        s.iloc[mask] = np.nan

        correct = s.bfill().values
        test = bfill(s.values)
        assert_almost_equal(correct, test)

        # test ndim=2
        df = pd.DataFrame(np.random.randn(N, N))
        df.iloc[mask] = np.nan
        correct = df.bfill().values
        test = bfill(df.values)
        assert_almost_equal(correct, test)
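
For reference, pandas bfill propagates the next valid observation backward, which is the behavior the custom bfill is checked against:

import numpy as np
import pandas as pd

s = pd.Series([np.nan, 2.0, np.nan, 4.0])
s.bfill().values   # array([2., 2., 4., 4.])
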
test_munge.py (project: zipline-chinese, author: zhanghan1990)
def test_ffill(self):
        # test ndim=1
        N = 100
        s = pd.Series(np.random.randn(N))
        mask = random.sample(range(N), 10)
        s.iloc[mask] = np.nan

        correct = s.ffill().values
        test = ffill(s.values)
        assert_almost_equal(correct, test)

        # test ndim=2
        df = pd.DataFrame(np.random.randn(N, N))
        df.iloc[mask] = np.nan
        correct = df.ffill().values
        test = ffill(df.values)
        assert_almost_equal(correct, test)
test_events.py (project: zipline-chinese, author: zhanghan1990)
def test_conversion_to_df(self, df, infer_timestamps):

        events_by_sid = {0: df}
        loader = EventDataSetLoader(
            dtx,
            events_by_sid,
            infer_timestamps=infer_timestamps,
        )
        self.assertEqual(
            loader.events_by_sid.keys(),
            events_by_sid.keys(),
        )

        if infer_timestamps:
            expected = pd.Series(index=[dtx[0]] * 10, data=dtx,
                                 name=ANNOUNCEMENT_FIELD_NAME)
        else:
            expected = pd.Series(index=dtx, data=dtx,
                                 name=ANNOUNCEMENT_FIELD_NAME)
            expected.index.name = TS_FIELD_NAME
        # Check that the index inferred from the first given date has been added
        assert_series_equal(
            loader.events_by_sid[0][ANNOUNCEMENT_FIELD_NAME],
            expected,
        )
data_fetcher.py (project: scikit-dataaccess, author: MITHaystack)
def getAntennaLogs():
        '''
        Retrieve information about antenna changes

        @return dictionary of antenna changes
        '''
        store_location = data_util.getDataLocation('ngl_gps')
        store = pd.HDFStore(store_location, 'r')
        logs_df = store['ngl_steps']
        store.close()

        metadata = DataFetcher.getStationMetadata()

        logs_dict = OrderedDict()

        for station in metadata.index:
            offset_dates = logs_df[logs_df['Station']==station].index.unique()
            offset_dates = pd.Series(offset_dates)
            logs_dict[station] = offset_dates

        return logs_dict
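
A hedged usage sketch; the station name is hypothetical, and the call assumes the ngl_gps HDF store is available locally:

logs = DataFetcher.getAntennaLogs()
dates = logs['P123']   # pd.Series of offset dates for station 'P123'
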
operate_load_poi_data.py (project: didi_competition, author: Heipiao)
def remove_error_poi_each_line(line_data):
    ## every element after the first is a POI token; the first element
    ## is the district hash
    ## iterate over a copy (temp_line_data): calling remove() on the list
    ## being iterated over would skip elements

    standard_style = re.compile(r"\d+#\d+:\d+")

    line_data = list(line_data[0])
    temp_line_data = line_data.copy()
    for poi_in_line in temp_line_data:
        if len(poi_in_line) == 32: # this is the district hash
            continue
        if not re.match(standard_style, poi_in_line):
            #print(poi_in_line)
            line_data.remove(poi_in_line)
    return pd.Series([line_data])

# the input line_data is a pandas Series
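
A small illustration under assumed inputs: the row wraps a single list whose first token is a 32-character district hash, and malformed POI tokens are dropped:

import pandas as pd

row = pd.Series([['d' * 32, '1#5:22', 'bad_token', '24#7:3']])
cleaned = remove_error_poi_each_line(row)
# cleaned[0] -> ['ddd...d', '1#5:22', '24#7:3']
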
test_core.py (project: dask_gdf, author: gpuopenanalytics)
def test_series_append():
    np.random.seed(0)

    n = 1000
    df = pd.DataFrame({'x': np.random.randint(0, 5, size=n),
                       'y': np.random.normal(size=n)})

    gdf = gd.DataFrame.from_pandas(df)
    frags = _fragmented_gdf(gdf, nsplit=13)

    frags = [df.x for df in frags]

    appending = dgd.from_pygdf(frags[0], npartitions=1)
    for frag in frags[1:]:
        appending = appending.append(frag)

    appended = appending.compute().to_pandas()
    assert isinstance(appended, pd.Series)
    np.testing.assert_array_equal(appended, df.x)
test_core.py (project: dask_gdf, author: gpuopenanalytics)
def test_take(nelem, nparts):
    np.random.seed(0)

    # Use a unique index range since the sort may not be stable
    x = np.random.randint(0, nelem, size=nelem)
    y = np.random.random(nelem)

    selected = np.random.randint(0, nelem - 1, size=nelem // 2)

    df = pd.DataFrame({'x': x, 'y': y})

    ddf = dd.from_pandas(df, npartitions=nparts)
    dgdf = dgd.from_dask_dataframe(ddf)
    out = dgdf.take(gd.Series(selected), npartitions=5)
    got = out.compute().to_pandas()

    expect = df.take(selected)
    assert 1 < out.npartitions <= 5
    np.testing.assert_array_equal(got.index, np.arange(len(got)))
    np.testing.assert_array_equal(got.x, expect.x)
    np.testing.assert_array_equal(got.y, expect.y)
core.py (project: dask_gdf, author: gpuopenanalytics)
def set_index(self, index, drop=True, sorted=False):
        """Set new index.

        Parameters
        ----------
        index : str or Series
            If a ``str`` is provided, it is used as the name of the
            column to be made into the index.
            If a ``Series`` is provided, it is used as the new index.
        drop : bool
            Whether the first original index column is dropped.
        sorted : bool
            Whether the new index column is already sorted.
        """
        if not drop:
            raise NotImplementedError('drop=False not supported yet')

        if isinstance(index, str):
            return self._set_index_raw(index, drop=drop, sorted=sorted)
        elif isinstance(index, Series):
            indexname = '__dask_gdf.index'
            df = self.assign(**{indexname: index})
            return df._set_index_raw(indexname, drop=drop, sorted=sorted)
        else:
            raise TypeError('cannot set_index from {}'.format(type(index)))
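
A usage sketch under assumed names (dgdf being a dask_gdf DataFrame with columns 'ts' and 'x', and column access working as in dask): either a column name or a Series is accepted as the new index:

by_name = dgdf.set_index('ts', sorted=True)   # index by column name
by_series = dgdf.set_index(dgdf['x'])         # index by a Series
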
actor.py (project: saapy, author: ashapochka)
def connect_actors(actor_frame, connectivity_sets, connectivity_column):
    """
    :param actor_frame: pandas DataFrame of actors, one row per raw actor
    :param connectivity_sets: mapping from a canonical actor id to the row
        positions that belong to the same real-world actor
    :param connectivity_column: name of the column to store the canonical
        actor id in
    :return: actor_frame with the connectivity column added

    Examples:

    same_actors = {
        'ccason': [3, 14, 15], 'clipka': [4, 5, 13],
        'wfpokorny': [11, 17], 'anshuarya': [0],
        'bentsm': [1], 'cbarton': [2], 'dbodor': [6],
        'jlecher': [7], 'jgrimbert': [8], 'nalvarez': [9],
        'selvik': [10], 'wverhelst': [12], 'gryken': [16],
        'github': [18]}
    actor_frame = connect_actors(actor_frame, same_actors, 'actor_id')
    """
    connectivity = {}
    for actor_id, connectivity_set in connectivity_sets.items():
        for actor in connectivity_set:
            connectivity[actor] = actor_id
    actor_frame[connectivity_column] = su.categorize(pd.Series(connectivity))
    return actor_frame
git_authorship.py (project: saapy, author: ashapochka)
def _compute_author_similarity(self, paired_authors):
        def row_similarity(row):
            same_email = row.author_email == row.author_email_other
            name_similarity = fuzz.token_set_ratio(row.author_name,
                                                   row.author_name_other)
            email_name_similarity = fuzz.ratio(row.email_name,
                                               row.email_name_other)
            name_to_email_similarity = fuzz.token_set_ratio(row.author_name,
                                                            row.name_from_email_other)
            return pd.Series(
                [same_email, name_similarity, email_name_similarity,
                 name_to_email_similarity])

        newcols = paired_authors.apply(row_similarity, axis=1)
        newcols.columns = ['same_email', 'name_similarity',
                           'email_name_similarity', 'name_to_email_similarity']
        newdf = paired_authors.join(newcols)
        return newdf
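
The fuzzy scores come from fuzzywuzzy; token_set_ratio ignores token order, which is what makes it suitable for matching reordered author names:

from fuzzywuzzy import fuzz

fuzz.token_set_ratio('John Smith', 'Smith, John')   # 100: order is ignored
fuzz.ratio('jsmith', 'johnsmith')                   # plain character-level similarity
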
QAIndicator_Series.py (project: QUANTAXIS, author: yutiansut)
def SMA(Series, N, M=1):

    ret = []
    i = 1
    length = len(Series)
    # skip leading NaN values (the scan starts at index 1)
    while i < length:
        if np.isnan(Series[i]):
            i += 1
        else:
            break
    preY = Series[i]  # Y'
    ret.append(preY)
    while i < length:
        Y = (M * Series[i] + (N - M) * preY) / float(N)
        ret.append(Y)
        preY = Y
        i += 1
    return pd.Series(ret)
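
A quick call sketch: leading NaN values are skipped, the first valid price seeds the recursion, and each later step blends the new value in with weight M/N:

import numpy as np
import pandas as pd

prices = pd.Series([np.nan, 10.0, 11.0, 12.0])
smoothed = SMA(prices, N=3)   # pd.Series of the smoothed values
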
indicators.py (project: QUANTAXIS, author: yutiansut)
def QA_indicator_dpo(data, N=20, M=6):
    """
    ????
    ???????????????????????????????????????????
    ???
    ????????????????????
    ???DPO????????????????????????????????????????????


    ???????????????????????????????
    ?20????????10?????????????
    ????????????????????????????????????????
    ?????????????????????????????0?????????????????????0??????
    ???????????????????????
    """
    _dpo = pd.Series(data) - pd.Series(data).rolling(N / 2 + 1).mean()
    _madpo = pd.Series(_dpo).rolling(M).mean()
    return _dpo, _madpo
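
Usage sketch with synthetic prices (any 1-D price sequence works; the names here are illustrative):

import numpy as np
import pandas as pd

close = pd.Series(np.random.normal(100, 5, size=60))
dpo, madpo = QA_indicator_dpo(close, N=20, M=6)
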
popuanalysis.py (project: NeoAnalysis, author: neoanalysis)
def stats_desc(self,store_key,cond):
        '''
        Args
            store_key (string):
                specifies which data in the workspace to analyze
            cond (string):
                sample observation
        Returns
            descriptive statistics
        '''
        datas = list()
        for ite_file in self.store.keys():
            datas.append(self.store[ite_file][store_key][str(cond)]['mean'].value)
        datas = pd.Series(datas)
        return datas.describe()

    # one way ANOVA
    # for scalar value usage only
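
For scalar summaries like the above, pandas describe() reports count, mean, std, min, the quartiles, and max:

import pandas as pd

pd.Series([1.2, 3.4, 2.2]).describe()
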
graphics.py (project: NeoAnalysis, author: neoanalysis)
def df_add(self,column,added_info):
        '''
        Args
            column (string):
                the column name to be played with
            added_info (string, int, float, pandas.Series or pandas.DataFrame):
                the information to be added to the selected column
        Returns
            -
        '''
        if isinstance(added_info,str):
            self.data_df[column] = self.data_df[column] + self.data_df[added_info]
        elif isinstance(added_info,(int,float)):
            self.data_df[column] = self.data_df[column] + added_info
        elif isinstance(added_info,(pd.Series,pd.DataFrame)):
            self.data_df[column] = self.data_df[column] + added_info

    # This function performs minus to a given column
graphics.py (project: NeoAnalysis, author: neoanalysis)
def df_minus(self,column,minus_info):
        '''
        Args
            column (string):
                the column name to be played with
            minus_info (string, int, float, pandas.Series or pandas.DataFrame):
                information to be subtracted from the selected column
        Returns
            -
        '''
        if isinstance(minus_info,str):
            self.data_df[column] = self.data_df[column] - self.data_df[minus_info]
        elif isinstance(minus_info,(int,float)):
            self.data_df[column] = self.data_df[column] - minus_info
        elif isinstance(minus_info,(pd.Series,pd.DataFrame)):
            self.data_df[column] = self.data_df[column] - minus_info

    # This function multiplys the selected column with certain factor
graphics.py (project: NeoAnalysis, author: neoanalysis)
def df_multiply(self,column,multiply_info):
        '''
        Args
            column (string):
                the column name to be played with
            multiply_info (string, int, float, pandas.Series or pandas.DataFrame):
                information to be used for multiplying
        Returns
            -
        '''
        if isinstance(multiply_info,str):
            self.data_df[column] = self.data_df[column] * self.data_df[multiply_info]
        elif isinstance(multiply_info,(int,float)):
            self.data_df[column] = self.data_df[column] * multiply_info
        elif isinstance(multiply_info,(pd.Series,pd.DataFrame)):
            self.data_df[column] = self.data_df[column] * multiply_info

    # This function divides the selected column by certain factor
graphics.py (project: NeoAnalysis, author: neoanalysis)
def df_division(self,column,division_info):
        '''
        Args
            column (string):
                the column name to be played with
            division_info (string, int, float, pandas.Series or pandas.DataFrame):
                information to be used for dividing
        Returns
            -
        '''
        if isinstance(division_info,str):
            self.data_df[column] = self.data_df[column] / self.data_df[division_info]
        elif isinstance(division_info,(int,float)):
            self.data_df[column] = self.data_df[column] / float(division_info)
        elif isinstance(division_info,(pd.Series,pd.DataFrame)):
            self.data_df[column] = self.data_df[column] / division_info

    # delete certain trials in the data table
popuanalysis.py (project: NeoAnalysis, author: neoanalysis)
def stats_desc(self,store_key,cond):
        '''
        Args
            store_key (string):
                specifies which data in the workspace to analyze
            cond (string):
                sample observation
        Returns
            descriptive statistics
        '''
        datas = list()
        for ite_file in list(self.store.keys()):
            datas.append(self.store[ite_file][store_key][str(cond)]['mean'].value)
        datas = pd.Series(datas)
        return datas.describe()

    # one way ANOVA
    # for scalar value usage only

