def aggregate_ohlcv_panel(self,
fields,
ohlcv_panel,
items=None,
minor_axis=None):
"""
Convert an OHLCV Panel into a DataFrame by aggregating each field's
frame into a Series.
"""
vals = ohlcv_panel
if isinstance(ohlcv_panel, pd.Panel):
vals = ohlcv_panel.values
items = ohlcv_panel.items
minor_axis = ohlcv_panel.minor_axis
data = [
self.frame_to_series(
field,
vals[items.get_loc(field)],
minor_axis
)
for field in fields
]
return np.array(data)
def to_series(tuples):
"""Transforms a list of tuples of the form (date, count) in to a pandas
series indexed by dt.
"""
cleaned_time_val_tuples = [t for t in tuples if not (
t[0] is pd.NaT or t[1] is None)]
if len(cleaned_time_val_tuples) > 0:
# Change the list of tuples, i.e. [(a1, b1), (a2, b2), ...], into a
# tuple of lists, i.e. ([a1, a2, ...], [b1, b2, ...]).
# Wrap zip() in list() so the result can be indexed under Python 3.
unzipped_cleaned_time_values = list(zip(*cleaned_time_val_tuples))
# Just being explicit about what these are.
timestamps = unzipped_cleaned_time_values[0]
counts = unzipped_cleaned_time_values[1]
# Create the series with a sorted index.
ret_val = pd.Series(counts, index=timestamps).sort_index()
else:
ret_val = None
return ret_val
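# A small usage sketch for to_series() above: pairs with a NaT date or a
# None count are dropped, and the result is indexed by timestamp in sorted
# order. The sample values here are illustrative only.
import pandas as pd

example_tuples = [
    (pd.Timestamp('2021-01-02'), 5),
    (pd.NaT, 3),                       # dropped: missing date
    (pd.Timestamp('2021-01-01'), 7),
]
counts_by_day = to_series(example_tuples)
# counts_by_day:
# 2021-01-01    7
# 2021-01-02    5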
def __init__(self, *args, **kwargs):
'''
Accepts the same arguments as pandas.Series:
https://pandas.pydata.org/pandas-docs/stable/generated/pandas.Series.html
To create an XSeries of any data_type, the data argument must be a Python list.
For example, to create an XSeries of pandas.Series objects, pass
data = [s_1, s_2, ..., s_n] where each s_i is an instance of pandas.Series.
'''
super(XSeries, self).__init__(*args, **kwargs)
data = kwargs.get('data')
if data is None:
data = args[0]
check_result, data_type = _check_all_elements_have_the_same_property(data, type)
if not check_result:
raise ValueError('Not all elements the same type')
if data_type is not None:
self._data_type = data_type
else:
self._data_type = type(data._values[0])
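# A minimal construction sketch, assuming the xpandas package is installed
# (the import path is an assumption based on the project's layout): all
# elements must share one type, which is then exposed as data_type.
import numpy as np
import pandas as pd
from xpandas.data_container import XSeries

xs = XSeries([
    pd.Series(np.random.normal(size=10)),
    pd.Series(np.random.normal(size=15)),
])
# xs.data_type is pd.Series; mixing element types raises ValueError.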
# Source: bag_of_features_transformer.py, project xpandas (alan-turing-institute)
def __init__(self, dictionary=None, **kwargs):
'''
:param dictionary: custom dictionary (vocabulary) to count against; if None, it is calculated from the dataset
'''
self.dictionary = dictionary
accepted_types = [
pd.Series, list, np.ndarray, tuple
]
def bag_of_words_transform_function(corpus):
counter = Counter(corpus)
for el in self.dictionary:
if counter.get(el) is None:
counter[el] = 0
return counter
super(BagOfWordsTransformer, self).__init__(data_types=accepted_types,
columns=None,
transform_function=bag_of_words_transform_function)
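# A plain-Python sketch of what bag_of_words_transform_function does for one
# corpus (any iterable of tokens): count the tokens, then give every
# vocabulary entry that never appears an explicit zero count. Names here are
# illustrative only.
from collections import Counter

vocabulary = ['cat', 'dog', 'fish']
tokens = ['cat', 'cat', 'dog']
counter = Counter(tokens)
for el in vocabulary:
    if counter.get(el) is None:
        counter[el] = 0
# counter == Counter({'cat': 2, 'dog': 1, 'fish': 0})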
def test_mean_transformer():
s1 = XSeries([
pd.Series(np.random.normal(size=10)),
pd.Series(np.random.normal(size=15))
])
s2 = XSeries([
pd.Series(np.random.normal(size=10)),
pd.Series(np.random.normal(size=15)),
pd.Series(np.random.normal(size=100))
])
tr = MeanSeriesTransformer()
tr = tr.fit(s1)
transformed_s = tr.transform(s2)
assert transformed_s.shape[0] == 3
assert type(transformed_s) == XSeries
def test_mean_transformer_data_frame():
s1 = XSeries([
pd.Series(np.random.normal(size=10)),
pd.Series(np.random.normal(size=15))
])
s2 = XSeries([
pd.Series(np.random.normal(size=10)),
pd.Series(np.random.normal(size=15))
])
df = XDataFrame({
's1': s1,
's2': s2
})
tr = MeanSeriesTransformer()
# fit() is expected to reject an XDataFrame input.
try:
tr.fit(df)
except Exception:
pass
else:
assert False, 'MeanSeriesTransformer.fit should raise on an XDataFrame'
def test_dataframe_data_types():
s1 = XSeries([pd.Series([1, 2, 3], index=['a', 'b', 'c']),
pd.Series([4, 5, 6], index=['d', 'e', 'g'])])
s2 = XSeries([1, 2, 3])
s3 = XSeries([{"k1": "v1"}, {"k2": 'v2'}])
s4 = XSeries(['f', 's', 't'])
df = XDataFrame({
'first_col': s1,
'second_col': s2,
'third_col': s3,
'fourth_col': s4
})
assert df['first_col'].data_type == pd.Series
assert df['second_col'].data_type == np.int64
assert df['third_col'].data_type == dict
assert df['fourth_col'].data_type == str
assert type(df[['first_col']]) == XDataFrame
assert type(df[['first_col', 'second_col']]) == XDataFrame
def test_dataframe_sub_frame_data_types():
s1 = XSeries([pd.Series([1, 2, 3], index=['a', 'b', 'c']),
pd.Series([4, 5, 6], index=['d', 'e', 'g'])])
s2 = XSeries([1, 2, 3])
s3 = XSeries([{"k1": "v1"}, {"k2": 'v2'}])
s4 = XSeries(['f', 's', 't'])
df = XDataFrame({
'first_col': s1,
'second_col': s2,
'third_col': s3,
'fourth_col': s4
})
sub_df = df.loc[:2]
assert type(sub_df) == XDataFrame
assert sub_df['first_col'].data_type == pd.Series
assert sub_df['second_col'].data_type == np.int64
assert sub_df['third_col'].data_type == dict
assert sub_df['fourth_col'].data_type == str
assert type(sub_df[['first_col']]) == XDataFrame
assert type(sub_df[['first_col', 'second_col']]) == XDataFrame
def test_series_replace_element():
s = XSeries([
pd.Series([1, 2, 3], index=['a', 'b', 'c']),
pd.Series([4, 5, 6], index=['d', 'e', 'g'])
], name='MySuperSeries')
# Replacing an element with a value of a different type should fail.
try:
s[0] = 111
except Exception:
pass
else:
assert False, 'assigning an int into a Series-typed XSeries should raise'
# Replacing an element with another pandas.Series should succeed.
s[0] = pd.Series(np.random.normal(size=100))
# Source: test_dataframe_transformer.py, project xpandas (alan-turing-institute)
def test_naming():
X = XSeries([
pd.Series(np.random.normal(0, 1, 100), name='X')
])
df = XDataFrame({
'X': X
})
dataframe_transformer = XDataFrameTransformer({
'X': [TimeSeriesTransformer()]
})
dataframe_transformer.fit(df)
transformed_df = dataframe_transformer.transform(df)
for col_name in transformed_df.columns:
assert col_name.startswith('X_TimeSeriesTransformer')
# Source: test_dataframe_transformer.py, project xpandas (alan-turing-institute)
def test_multiple_transformers_for_one_column():
X = XSeries([
pd.Series(np.random.normal(0, 1, 100), name='X')
])
df = XDataFrame({
'X': X
})
dataframe_transformer = XDataFrameTransformer({
'X': [TimeSeriesTransformer(), IdentityTransformer(), MeanSeriesTransformer()]
})
dataframe_transformer.fit(df)
transformed_df = dataframe_transformer.transform(df)
for col_name in transformed_df.columns:
assert col_name.startswith('X_TimeSeriesTransformer') or \
col_name.startswith('X_IdentityTransformer') or \
col_name.startswith('X_MeanSeriesTransformer')
def test_ts_fresh_chain():
s1 = XSeries([
pd.Series(np.random.normal(0, 1, 20))
for _ in range(10)
], name='X')
pipe = PipeLineChain([
('mean shift', TimeSeriesWindowTransformer()),
('ts fresh step', TsFreshSeriesTransformer())
])
pipe.fit(s1)
transformed_df = pipe.transform(s1)
# print(transformed_df.head())
assert type(transformed_df) == XDataFrame
def test_bfill(self):
# test ndim=1
N = 100
s = pd.Series(np.random.randn(N))
mask = random.sample(range(N), 10)
s.iloc[mask] = np.nan
correct = s.bfill().values
test = bfill(s.values)
assert_almost_equal(correct, test)
# test ndim=2
df = pd.DataFrame(np.random.randn(N, N))
df.iloc[mask] = np.nan
correct = df.bfill().values
test = bfill(df.values)
assert_almost_equal(correct, test)
def test_ffill(self):
# test ndim=1
N = 100
s = pd.Series(np.random.randn(N))
mask = random.sample(range(N), 10)
s.iloc[mask] = np.nan
correct = s.ffill().values
test = ffill(s.values)
assert_almost_equal(correct, test)
# test ndim=2
df = pd.DataFrame(np.random.randn(N, N))
df.iloc[mask] = np.nan
correct = df.ffill().values
test = ffill(df.values)
assert_almost_equal(correct, test)
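# The ffill/bfill helpers exercised above are not shown in this snippet; as a
# rough illustration only, a 1-D forward fill can be written with NumPy alone
# (hypothetical helper, not the implementation under test):
import numpy as np

def ffill_1d(values):
    """Propagate the last non-NaN value forward over NaN gaps."""
    values = np.asarray(values, dtype=float)
    idx = np.where(~np.isnan(values), np.arange(len(values)), 0)
    np.maximum.accumulate(idx, out=idx)
    return values[idx]

# ffill_1d([1.0, np.nan, np.nan, 4.0]) -> array([1., 1., 1., 4.])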
def test_conversion_to_df(self, df, infer_timestamps):
events_by_sid = {0: df}
loader = EventDataSetLoader(
dtx,
events_by_sid,
infer_timestamps=infer_timestamps,
)
self.assertEqual(
loader.events_by_sid.keys(),
events_by_sid.keys(),
)
if infer_timestamps:
expected = pd.Series(index=[dtx[0]] * 10, data=dtx,
name=ANNOUNCEMENT_FIELD_NAME)
else:
expected = pd.Series(index=dtx, data=dtx,
name=ANNOUNCEMENT_FIELD_NAME)
expected.index.name = TS_FIELD_NAME
# Check that index by first given date has been added
assert_series_equal(
loader.events_by_sid[0][ANNOUNCEMENT_FIELD_NAME],
expected,
)
def getAntennaLogs():
'''
Retrieve information about antenna changes
@return dictionary of antenna changes
'''
store_location = data_util.getDataLocation('ngl_gps')
store = pd.HDFStore(store_location, 'r')
logs_df = store['ngl_steps']
store.close()
metadata = DataFetcher.getStationMetadata()
logs_dict = OrderedDict()
for station in metadata.index:
offset_dates = logs_df[logs_df['Station']==station].index.unique()
offset_dates = pd.Series(offset_dates)
logs_dict[station] = offset_dates
return logs_dict
def remove_error_poi_each_line(line_data):
# The entry of length 32 is the district hash and is kept as-is.
# Iterate over a copy (temp_line_data) because list.remove() mutates
# line_data, and removing items from the list being iterated skips elements.
standard_style = re.compile(r"\d+#\d+:\d+")
line_data = list(line_data[0])
temp_line_data = line_data.copy()
for poi_in_line in temp_line_data:
if len(poi_in_line) == 32: # this is the district hash
continue
if not re.match(standard_style, poi_in_line):
#print(poi_in_line)
line_data.remove(poi_in_line)
return pd.Series([line_data])
# The input line_data is a pandas Series.
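# Tiny illustration of why the copy matters: removing from the list you are
# iterating over makes the loop skip the element after each removal.
items = ['a', 'bad', 'bad', 'b']
for x in list(items):   # iterate over a copy, as remove_error_poi_each_line does
    if x == 'bad':
        items.remove(x)
# items == ['a', 'b']; iterating over items directly would leave one 'bad' behind.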
def test_series_append():
np.random.seed(0)
n = 1000
df = pd.DataFrame({'x': np.random.randint(0, 5, size=n),
'y': np.random.normal(size=n)})
gdf = gd.DataFrame.from_pandas(df)
frags = _fragmented_gdf(gdf, nsplit=13)
frags = [df.x for df in frags]
appending = dgd.from_pygdf(frags[0], npartitions=1)
for frag in frags[1:]:
appending = appending.append(frag)
appended = appending.compute().to_pandas()
assert isinstance(appended, pd.Series)
np.testing.assert_array_equal(appended, df.x)
def test_take(nelem, nparts):
np.random.seed(0)
# Use a unique index range, as the sort may not be stable.
x = np.random.randint(0, nelem, size=nelem)
y = np.random.random(nelem)
selected = np.random.randint(0, nelem - 1, size=nelem // 2)
df = pd.DataFrame({'x': x, 'y': y})
ddf = dd.from_pandas(df, npartitions=nparts)
dgdf = dgd.from_dask_dataframe(ddf)
out = dgdf.take(gd.Series(selected), npartitions=5)
got = out.compute().to_pandas()
expect = df.take(selected)
assert 1 < out.npartitions <= 5
np.testing.assert_array_equal(got.index, np.arange(len(got)))
np.testing.assert_array_equal(got.x, expect.x)
np.testing.assert_array_equal(got.y, expect.y)
def set_index(self, index, drop=True, sorted=False):
"""Set new index.
Parameters
----------
index : str or Series
If a ``str`` is provided, it is used as the name of the
column to be made into the index.
If a ``Series`` is provided, it is used as the new index
drop : bool
Whether the first original index column is dropped.
sorted : bool
Whether the new index column is already sorted.
"""
if not drop:
raise NotImplementedError('drop=False not supported yet')
if isinstance(index, str):
return self._set_index_raw(index, drop=drop, sorted=sorted)
elif isinstance(index, Series):
indexname = '__dask_gdf.index'
df = self.assign(**{indexname: index})
return df._set_index_raw(indexname, drop=drop, sorted=sorted)
else:
raise TypeError('cannot set_index from {}'.format(type(index)))
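# For illustration only, the same calling pattern in plain pandas: set_index
# accepts either a column name or an explicit Series, just like the method
# above (which additionally rejects drop=False for now).
import pandas as pd

pdf = pd.DataFrame({'key': [2, 0, 1], 'val': [20.0, 0.0, 10.0]})
by_name = pdf.set_index('key')              # index from an existing column
by_series = pdf.set_index(pdf['key'] * 10)  # index from a Series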
def connect_actors(actor_frame, connectivity_sets, connectivity_column):
"""
:param actor_frame: frame of actors whose rows are keyed by the ids used in connectivity_sets
:param connectivity_sets: dict mapping a canonical actor id to the keys of rows that belong to the same actor
:param connectivity_column: name of the column in which to store the canonical actor id
:return: actor_frame with connectivity_column added as a categorical column
Examples:
same_actors = {
'ccason': [3, 14, 15], 'clipka': [4, 5, 13],
'wfpokorny': [11, 17], 'anshuarya': [0],
'bentsm': [1], 'cbarton': [2], 'dbodor': [6],
'jlecher': [7], 'jgrimbert': [8], 'nalvarez': [9],
'selvik': [10], 'wverhelst': [12], 'gryken': [16],
'github': [18]}
actor_frame = connect_actors(actor_frame, same_actors, 'actor_id')
"""
connectivity = {}
for actor_id, connectivity_set in connectivity_sets.items():
for actor in connectivity_set:
connectivity[actor] = actor_id
actor_frame[connectivity_column] = su.categorize(pd.Series(connectivity))
return actor_frame
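# A runnable mini-demonstration of the connectivity dict built above
# (su.categorize is project-specific and not reproduced here):
same_actors = {'ccason': [3, 14, 15], 'clipka': [4, 5, 13]}
connectivity = {}
for actor_id, connectivity_set in same_actors.items():
    for actor in connectivity_set:
        connectivity[actor] = actor_id
# connectivity == {3: 'ccason', 14: 'ccason', 15: 'ccason',
#                  4: 'clipka', 5: 'clipka', 13: 'clipka'}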
def _compute_author_similarity(self, paired_authors):
def row_similarity(row):
same_email = row.author_email == row.author_email_other
name_similarity = fuzz.token_set_ratio(row.author_name,
row.author_name_other)
email_name_similarity = fuzz.ratio(row.email_name,
row.email_name_other)
name_to_email_similarity = fuzz.token_set_ratio(row.author_name,
row.name_from_email_other)
return pd.Series(
[same_email, name_similarity, email_name_similarity,
name_to_email_similarity])
newcols = paired_authors.apply(row_similarity, axis=1)
newcols.columns = ['same_email', 'name_similarity',
'email_name_similarity', 'name_to_email_similarity']
newdf = paired_authors.join(newcols)
return newdf
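# Rough illustration of the fuzzy scores used in row_similarity (requires the
# fuzzywuzzy package; exact numbers can vary between versions):
from fuzzywuzzy import fuzz

name_score = fuzz.token_set_ratio('Jane Q. Doe', 'Doe, Jane')  # ~100: same tokens
email_score = fuzz.ratio('jdoe', 'jdoe42')                     # lower: plain edit ratio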
def SMA(Series, N, M=1):
ret = []
i = 1
length = len(Series)
# Skip any leading NaN values in the input.
while i < length:
if np.isnan(Series[i]):
i += 1
else:
break
preY = Series[i] # Y'
ret.append(preY)
while i < length:
Y = (M * Series[i] + (N - M) * preY) / float(N)
ret.append(Y)
preY = Y
i += 1
return pd.Series(ret)
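# SMA(X, N, M) above is the recursively smoothed moving average common in
# Chinese TA formulas: Y = (M * X + (N - M) * Y_prev) / N, seeded with the
# first non-NaN value (which is also kept in the output). A small worked
# example with made-up prices:
import numpy as np
import pandas as pd

prices = pd.Series([np.nan, 10.0, 11.0, 12.0])
smoothed = SMA(prices, 3, 1)
# seed = 10.0, then
# (1*10 + 2*10.0)     / 3 = 10.0
# (1*11 + 2*10.0)     / 3 = 10.333...
# (1*12 + 2*10.333...) / 3 = 10.888...
# smoothed -> [10.0, 10.0, 10.333..., 10.888...]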
def QA_indicator_dpo(data, N=20, M=6):
"""
????
???????????????????????????????????????????
???
????????????????????
???DPO????????????????????????????????????????????
???????????????????????????????
?20????????10?????????????
????????????????????????????????????????
?????????????????????????????0?????????????????????0??????
???????????????????????
"""
_dpo = pd.Series(data) - pd.Series(data).rolling(N / 2 + 1).mean()
_madpo = pd.Series(_dpo).rolling(M).mean()
return _dpo, _madpo
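# Quick usage sketch for QA_indicator_dpo above with synthetic closing prices
# (the first N//2 values of _dpo are NaN while the rolling window fills):
import numpy as np
import pandas as pd

closes = pd.Series(100 + np.cumsum(np.random.normal(size=60)))
dpo, madpo = QA_indicator_dpo(closes, N=20, M=6)
# dpo > 0 means the close is above its displaced (N//2 + 1)-period average.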
def stats_desc(self,store_key,cond):
'''
Args
store_key (string):
define which data to be analyzed in the workspace
cond (string):
sample observation
Returns
descriptive statistics
'''
datas = list()
for ite_file in self.store.keys():
datas.append(self.store[ite_file][store_key][str(cond)]['mean'].value)
datas = pd.Series(datas)
return datas.describe()
# one way ANOVA
# for scalar value usage only
def df_add(self,column,added_info):
'''
Args
column (string):
the column name to be played with
added_info (string, int, float or pandas.DataFrame):
The information to be added to the selected column can be string, int, float, or
pandas.DataFrame
Returns
-
'''
if isinstance(added_info,str):
self.data_df[column] = self.data_df[column] + self.data_df[added_info]
elif isinstance(added_info,(int,float)):
self.data_df[column] = self.data_df[column] + added_info
elif isinstance(added_info,(pd.Series,pd.DataFrame)):
self.data_df[column] = self.data_df[column] + added_info
# This function performs minus to a given column
def df_minus(self,column,minus_info):
'''
Args
column (string):
the column name to be played with
minus_info (string, int, float or pandas.DataFrame):
information to be subtracted from the selected column
Returns
-
'''
if isinstance(minus_info,str):
self.data_df[column] = self.data_df[column] - self.data_df[minus_info]
elif isinstance(minus_info,(int,float)):
self.data_df[column] = self.data_df[column] - minus_info
elif isinstance(minus_info,(pd.Series,pd.DataFrame)):
self.data_df[column] = self.data_df[column] - minus_info
# This function multiplys the selected column with certain factor
def df_multiply(self,column,multiply_info):
'''
Args
column (string):
the column name to be played with
multiply_info (string, int, float or pandas.DataFrame):
information to be used for multiplying
Returns
-
'''
if isinstance(multiply_info,str):
self.data_df[column] = self.data_df[column] * self.data_df[multiply_info]
elif isinstance(multiply_info,(int,float)):
self.data_df[column] = self.data_df[column] * multiply_info
elif isinstance(multiply_info,(pd.Series,pd.DataFrame)):
self.data_df[column] = self.data_df[column] * multiply_info
# This function divides the selected column by certain factor
def df_division(self,column,division_info):
'''
Args
column (string):
the column name to be played with
division_info (string, int, float or pandas.DataFrame):
information to be used for dividing
Returns
-
'''
if isinstance(division_info,str):
self.data_df[column] = self.data_df[column] / self.data_df[division_info]
elif isinstance(division_info,(int,float)):
self.data_df[column] = self.data_df[column] / float(division_info)
elif isinstance(division_info,(pd.Series,pd.DataFrame)):
self.data_df[column] = self.data_df[column] / division_info
# delete certain trials in the data table