def aggregate_ohlcv_panel(self,
fields,
ohlcv_panel,
items=None,
minor_axis=None):
"""
Convert an OHLCV Panel into a DataFrame by aggregating each field's
frame into a Series.
"""
vals = ohlcv_panel
if isinstance(ohlcv_panel, pd.Panel):
vals = ohlcv_panel.values
items = ohlcv_panel.items
minor_axis = ohlcv_panel.minor_axis
data = [
self.frame_to_series(
field,
vals[items.get_loc(field)],
minor_axis
)
for field in fields
]
return np.array(data)
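The method relies on `pandas.Index.get_loc` to pull one field's 2-D slice out of the panel's 3-D value array. A minimal sketch of that indexing, with made-up shapes (5 fields, 10 dates, 3 sids):

import numpy as np
import pandas as pd

items = pd.Index(['open', 'high', 'low', 'close', 'volume'])
vals = np.random.randn(5, 10, 3)            # (field, date, sid)
close_frame = vals[items.get_loc('close')]  # 2-D (date, sid) block for one field
assert close_frame.shape == (10, 3)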
def y_sum_by_time(x_arr, y_arr, top=None):
df = pd.DataFrame({'Timestamp': pd.to_datetime(x_arr, unit='s'), 'Status': y_arr})
df['Date'] = df['Timestamp'].apply(lambda x: "%d/%d/%d" % (x.day, x.month, x.year))
df['Hour'] = df['Timestamp'].apply(lambda x: "%d" % (x.hour))
df['Weekday'] = df['Timestamp'].apply(lambda x: "%s" % (x.weekday_name))
times = ['Hour', 'Weekday', 'Date']
result = {}
for groupby in times:
df_group = df.groupby(groupby, as_index=False).agg({'Status': np.sum})
        if top is not None and top > 0:
            # keep only the top-N groups by summed Status
            idx = df_group.index.isin(df_group.nlargest(top, 'Status').index)
else:
idx = df_group['Status'].max() == df_group['Status']
result[groupby] = {k: g['Status'].replace(np.nan, 'None').tolist() for k,g in df_group[idx].groupby(groupby)}
return result
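A quick usage sketch with three epoch timestamps (2017-01-01 falls on a Sunday); with the default `top=None`, only the maximal group per key survives. Expected output under the older pandas versions this snippet targets (`weekday_name` was later removed):

ts = [1483228800, 1483232400, 1483232400]   # 00:00, 01:00, 01:00 UTC on 2017-01-01
print(y_sum_by_time(ts, [1, 2, 3]))
# {'Hour': {'1': [5]}, 'Weekday': {'Sunday': [6]}, 'Date': {'1/1/2017': [6]}}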
def get_citation_df(args, text):
"""
Generate citation_df and save it to 'citations.tsv'.
"""
citation_df = pandas.DataFrame(
{'string': get_citation_strings(text)}
)
if args.citation_tags_path.is_file():
tag_df = pandas.read_table(args.citation_tags_path)
tag_df['string'] = '@tag:' + tag_df.tag
for citation in tag_df.citation:
is_valid_citation_string('@' + citation)
citation_df = citation_df.merge(tag_df[['string', 'citation']], how='left')
else:
citation_df['citation'] = None
logging.info(f'missing {args.citation_tags_path} file: no citation tags set')
citation_df.citation.fillna(citation_df.string.astype(str).str.lstrip('@'), inplace=True)
citation_df['standard_citation'] = citation_df.citation.map(standardize_citation)
citation_df['citation_id'] = citation_df.standard_citation.map(get_citation_id)
citation_df = citation_df.sort_values(['standard_citation', 'citation'])
citation_df.to_csv(args.citations_path, sep='\t', index=False)
check_collisions(citation_df)
check_multiple_citation_strings(citation_df)
return citation_df
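The tag expansion works by synthesizing a `string` column on the tag table and left-merging it in; untagged strings fall back to themselves minus the leading '@'. A toy run with hypothetical identifiers:

import pandas

citation_df = pandas.DataFrame({'string': ['@doi:10.1000/example', '@tag:deep-review']})
tag_df = pandas.DataFrame({'tag': ['deep-review'], 'citation': ['doi:10.1000/other']})
tag_df['string'] = '@tag:' + tag_df.tag
merged = citation_df.merge(tag_df[['string', 'citation']], how='left')
merged.citation.fillna(merged.string.str.lstrip('@'), inplace=True)
# '@doi:10.1000/example' resolves to itself; '@tag:deep-review' to 'doi:10.1000/other'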
def do_work_pso(item, nclusters, data, LVcsv, Mcsv, scheme, reg, h, maximo, population):
output = pd.DataFrame(population[item].position)
output.columns = ['Split']
dataSplit = pd.concat([data, output], axis=1)
f1 = []
results = []
for i in range(nclusters):
dataSplited = (dataSplit.loc[dataSplit['Split']
== i]).drop('Split', axis=1)
dataSplited.index = range(len(dataSplited))
try:
results.append(PyLSpm(dataSplited, LVcsv, Mcsv, scheme,
reg, 0, 50, HOC='true'))
resid = results[i].residuals()[3]
f1.append(resid)
        except Exception:
f1.append(10000)
# print((1 / np.sum(f1)))
return (1 / np.sum(f1))
def do_work_ga(item, nclusters, data, LVcsv, Mcsv, scheme, reg, h, maximo, population):
output = pd.DataFrame(population[item].genes)
output.columns = ['Split']
dataSplit = pd.concat([data, output], axis=1)
f1 = []
results = []
for i in range(nclusters):
dataSplited = (dataSplit.loc[dataSplit['Split']
== i]).drop('Split', axis=1)
dataSplited.index = range(len(dataSplited))
try:
results.append(PyLSpm(dataSplited, LVcsv, Mcsv, scheme,
reg, 0, 50, HOC='true'))
resid = results[i].residuals()[3]
f1.append(resid)
        except Exception:
f1.append(10000)
return (1 / np.sum(f1))
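Both workers turn the summed residuals into a fitness to maximize, so the 10000 appended on a failed fit drags the score toward zero. With toy residuals:

import numpy as np

f1 = [0.8, 1.2]          # residuals from two well-fitted clusters
print(1 / np.sum(f1))    # 0.5
f1.append(10000)         # one cluster failed to fit
print(1 / np.sum(f1))    # ~1e-4: this split is effectively discarded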
def rhoA(self):
# rhoA
rhoA = pd.DataFrame(0, index=np.arange(1), columns=self.latent)
for i in range(self.lenlatent):
weights = pd.DataFrame(self.outer_weights[self.latent[i]])
weights = weights[(weights.T != 0).any()]
result = pd.DataFrame.dot(weights.T, weights)
result_ = pd.DataFrame.dot(weights, weights.T)
S = self.data_[self.Variables['measurement'][
self.Variables['latent'] == self.latent[i]]]
S = pd.DataFrame.dot(S.T, S) / S.shape[0]
numerador = (
np.dot(np.dot(weights.T, (S - np.diag(np.diag(S)))), weights))
denominador = (
(np.dot(np.dot(weights.T, (result_ - np.diag(np.diag(result_)))), weights)))
        rhoA_ = (result ** 2) * (numerador / denominador)
        if np.isnan(rhoA_.values):
            rhoA[self.latent[i]] = 1
        else:
            rhoA[self.latent[i]] = rhoA_.values
return rhoA.T
def xloads(self):
# Xloadings
A = self.data_.transpose().values
B = self.fscores.transpose().values
A_mA = A - A.mean(1)[:, None]
B_mB = B - B.mean(1)[:, None]
ssA = (A_mA**2).sum(1)
ssB = (B_mB**2).sum(1)
xloads_ = (np.dot(A_mA, B_mB.T) /
np.sqrt(np.dot(ssA[:, None], ssB[None])))
xloads = pd.DataFrame(
xloads_, index=self.manifests, columns=self.latent)
return xloads
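The vectorized expression is the row-wise Pearson correlation between the two blocks; it equals the cross block of `np.corrcoef` on the stacked data, which makes a handy sanity check:

import numpy as np

A = np.random.randn(3, 50)                 # manifests x observations
B = np.random.randn(2, 50)                 # factor scores x observations
A_mA = A - A.mean(1)[:, None]
B_mB = B - B.mean(1)[:, None]
ssA = (A_mA ** 2).sum(1)
ssB = (B_mB ** 2).sum(1)
xl = np.dot(A_mA, B_mB.T) / np.sqrt(np.dot(ssA[:, None], ssB[None]))
assert np.allclose(xl, np.corrcoef(np.vstack([A, B]))[:3, 3:])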
def alpha(self):
# Cronbach Alpha
alpha = pd.DataFrame(0, index=np.arange(1), columns=self.latent)
for i in range(self.lenlatent):
block = self.data_[self.Variables['measurement']
[self.Variables['latent'] == self.latent[i]]]
p = len(block.columns)
        if p != 1:
p_ = len(block)
correction = np.sqrt((p_ - 1) / p_)
soma = np.var(np.sum(block, axis=1))
cor_ = pd.DataFrame.corr(block)
denominador = soma * correction**2
numerador = 2 * np.sum(np.tril(cor_) - np.diag(np.diag(cor_)))
alpha_ = (numerador / denominador) * (p / (p - 1))
alpha[self.latent[i]] = alpha_
else:
alpha[self.latent[i]] = 1
return alpha.T
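For reference, the textbook covariance-based Cronbach's alpha is alpha = p/(p-1) * (1 - trace(C)/sum(C)). A minimal sketch of that classical formula (not necessarily numerically identical to the correlation-based variant above):

import numpy as np

def cronbach_alpha(block):
    # block: observations x items
    C = np.cov(block, rowvar=False)
    p = C.shape[0]
    return (p / (p - 1)) * (1 - np.trace(C) / C.sum())

block = np.random.randn(100, 4) + np.random.randn(100, 1)  # items sharing a common factor
print(cronbach_alpha(block))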
def do_work_pso(item, nclusters, data, LVcsv, Mcsv, scheme, reg, h, maximo, population):
output = pd.DataFrame(population[item].position)
output.columns = ['Split']
dataSplit = pd.concat([data, output], axis=1)
f1 = []
results = []
for i in range(nclusters):
dataSplited = (dataSplit.loc[dataSplit['Split']
== i]).drop('Split', axis=1)
dataSplited.index = range(len(dataSplited))
try:
results.append(PyLSpm(dataSplited, LVcsv, Mcsv, scheme,
reg, 0, 50, HOC='true'))
resid = results[i].residuals()[3]
f1.append(resid)
        except Exception:
f1.append(10000)
print((1 / np.sum(f1)))
return (1 / np.sum(f1))
def do_work_ga(self, item):
output = pd.DataFrame(self.population[item].genes)
output.columns = ['Split']
dataSplit = pd.concat([self.data, output], axis=1)
f1 = []
results = []
for i in range(self.nclusters):
dataSplited = (dataSplit.loc[dataSplit['Split']
== i]).drop('Split', axis=1)
dataSplited.index = range(len(dataSplited))
try:
results.append(PyLSpm(dataSplited, self.LVcsv, self.Mcsv, self.scheme,
self.reg, 0, 50, HOC='true'))
resid = results[i].residuals()[3]
f1.append(resid)
        except Exception:
f1.append(10000)
print((1 / np.sum(f1)))
return (1 / np.sum(f1))
def do_work_pso(self, item):
output = pd.DataFrame(self.population[item].position)
output.columns = ['Split']
dataSplit = pd.concat([self.data, output], axis=1)
f1 = []
results = []
for i in range(self.nclusters):
dataSplited = (dataSplit.loc[dataSplit['Split']
== i]).drop('Split', axis=1)
dataSplited.index = range(len(dataSplited))
try:
results.append(PyLSpm(dataSplited, self.LVcsv, self.Mcsv, self.scheme,
self.reg, 0, 50, HOC='true'))
resid = results[i].residuals()[3]
f1.append(resid)
        except Exception:
f1.append(10000)
print((1 / np.sum(f1)))
return (1 / np.sum(f1))
def do_work_tabu(self, item):
output = pd.DataFrame(self.population[item])
output.columns = ['Split']
dataSplit = pd.concat([self.data, output], axis=1)
f1 = []
results = []
for i in range(self.nclusters):
dataSplited = (dataSplit.loc[dataSplit['Split']
== i]).drop('Split', axis=1)
dataSplited.index = range(len(dataSplited))
try:
results.append(PyLSpm(dataSplited, self.LVcsv, self.Mcsv, self.scheme,
self.reg, 0, 50, HOC='true'))
resid = results[i].residuals()[3]
f1.append(resid)
        except Exception:
f1.append(10000)
cost = (np.sum(f1))
print(1 / cost)
return [self.population[item], cost]
def apply(path):
data = metadata.load(path)
for service in data["services"]:
filename = os.path.join(path, service["filename"])
df = load_timeseries(filename, service)
print(service)
df2 = interpolate_missing(df[service["fields"]])
classes = classify_series(df2)
preprocessed_series = {}
for k in classes["other_fields"]:
            # drop the first value so these series stay aligned with the diffed series below
preprocessed_series[k] = df2[k][1:]
for k in classes["monotonic_fields"]:
preprocessed_series[k + "-diff"] = df2[k].diff()[1:]
newname = service["name"] + "-preprocessed.tsv.gz"
df3 = pd.DataFrame(preprocessed_series)
df3.to_csv(os.path.join(path, newname), sep="\t", compression='gzip')
service["preprocessed_filename"] = newname
service["preprocessed_fields"] = list(df3.columns)
service.update(classes)
metadata.save(path, data)
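Monotonic fields (e.g. cumulative counters) are converted to per-interval changes via `diff()`, and every other series is trimmed by one value so lengths stay aligned. A minimal sketch:

import pandas as pd

counter = pd.Series([0, 3, 7, 7, 12])    # monotonically increasing counter
gauge = pd.Series([5, 6, 5, 4, 6])       # an "other" field
print(counter.diff()[1:].tolist())       # [3.0, 4.0, 0.0, 5.0]
print(gauge[1:].tolist())                # [6, 5, 4, 6] -- same length as the diff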
def centroids(path):
metadata = load_metadata(path)
d = {}
for srv in metadata["services"]:
name = "%s/%s-cluster-1_1.tsv" % (path, srv["name"])
df = pd.read_csv(name, sep="\t", index_col='time', parse_dates=True)
d[srv["name"]] = df.centroid
df2 = pd.DataFrame(d)
df2 = df2.fillna(method="bfill", limit=1e9)
df2 = df2.fillna(method="ffill", limit=1e9)
    ax = df2.plot()  # DataFrame.plot returns a matplotlib Axes
    handles, labels = ax.get_legend_handles_labels()
    ax.grid('on')
    lgd = ax.legend(handles, labels, loc='upper center', bbox_to_anchor=(0.5, -0.1))
plt.savefig("graph.png", bbox_extra_artists=(lgd,), bbox_inches='tight')
plt.close("all")
def test_pd_outer_join():
dfs = [
pd.DataFrame({
'id': [0, 1, 2, 3],
'a': ['foo', 'bar', 'baz', np.nan],
'b': ['panda', 'zebra', np.nan, np.nan],
}),
pd.DataFrame({
'id': [1, 2, 3, 4],
'b': ['mouse', np.nan, 'tiger', 'egret'],
'c': ['toe', 'finger', 'nose', np.nan],
}),
]
expected = pd.DataFrame({
'id': [0, 1, 2, 3, 4],
'a': ['foo', 'bar', 'baz', np.nan, np.nan],
'b': ['panda', 'zebra', np.nan, 'tiger', 'egret'],
'c': [np.nan, 'toe', 'finger', 'nose', np.nan],
}).set_index('id')
actual = pd_outer_join(dfs, on='id')
print(expected)
print(actual)
assert expected.equals(actual)
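`pd_outer_join` itself isn't shown here; a minimal sketch that satisfies this test, folding the frames together with `combine_first` (earlier frames win on conflicts, which is why id 1 keeps 'zebra' over 'mouse'):

from functools import reduce

def pd_outer_join(dfs, on):
    # index each frame by the join key, then fill gaps left-to-right
    return reduce(lambda left, right: left.combine_first(right),
                  (df.set_index(on) for df in dfs))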
def read_image(imagery_path):
# Read image
dataset = gdal.Open(imagery_path)
dsmatrix = dataset.ReadAsArray(xoff=0, yoff=0, xsize=dataset.RasterXSize, ysize=dataset.RasterYSize)
# Get Geographic meta data
geo_trans_list = dataset.GetGeoTransform()
proj_str = dataset.GetProjection()
num_bands = dataset.RasterCount
    # Handle single-band and multi-band rasters
if num_bands > 1:
# Unfold array into pandas DataFrame
rows = dsmatrix.shape[1]
cols = dsmatrix.shape[2]
data_array = dsmatrix[:,0,:]
        for irow in range(1, rows):
            tempmatrix = dsmatrix[:, irow, :]
            data_array = np.hstack((data_array, tempmatrix))
else:
# Unfold array into pandas DataFrame
rows = dsmatrix.shape[0]
cols = dsmatrix.shape[1]
data_array = dsmatrix[0,:]
        for irow in range(1, rows):
            tempmatrix = dsmatrix[irow, :]
            data_array = np.hstack((data_array, tempmatrix))
data_frame = pd.DataFrame(data_array.T)
return data_frame, rows, cols, geo_trans_list, proj_str, num_bands
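The `hstack` loop that unfolds the raster is equivalent to a single C-order `reshape`, which is worth knowing for large images; a quick check on a toy (bands, rows, cols) cube:

import numpy as np

dsmatrix = np.arange(24).reshape(2, 3, 4)       # (bands, rows, cols)
data_array = dsmatrix[:, 0, :]
for irow in range(1, dsmatrix.shape[1]):
    data_array = np.hstack((data_array, dsmatrix[:, irow, :]))
assert np.array_equal(data_array, dsmatrix.reshape(2, -1))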
def __init__(self, *args, **kwargs):
        '''
        Accepts the same arguments as pandas.DataFrame:
        https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html
        The data argument should be a list of XSeries objects or a dict of
        XSeries objects. If a dict is passed, each key must be a string naming
        the corresponding column. For example, to create an XDataFrame, data
        should look like
        data = {'col_1': s_1, 'col_2': s_2, ..., 'col_n': s_n} where each s_i is an XSeries.
        '''
        data = kwargs.get('data')
        if data is None and args:
            data = args[0]
data_to_check = []
if isinstance(data, list):
data_to_check = data
elif isinstance(data, dict):
data_to_check = data.values()
for d in data_to_check:
if not isinstance(d, XSeries):
raise ValueError('All data must be XSeries instances')
super(XDataFrame, self).__init__(*args, **kwargs)
def to_pandas_dataframe(self):
'''
Convert self to pandas.DataFrame if all columns are primitive types.
See more at XSeries.to_pandas_series
:return:
'''
data_types = self.get_data_types()
is_all_columns_are_primitive = all(
_is_class_a_primitive(dt)
for dt in data_types
)
if is_all_columns_are_primitive:
self.__class__ = pd.DataFrame
else:
            raise ValueError('Unable to cast to pd.DataFrame: '
                             'not all column types are primitive: {}'.format(data_types))
return self
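The cast works by rebinding the instance's `__class__` rather than copying data, which is legal when the two classes share the same memory layout; a stripped-down illustration of the same trick:

class Tagged(dict):
    pass

obj = Tagged(a=1)
obj.__class__ = dict       # re-type the instance in place; no data is copied
assert type(obj) is dict and obj == {'a': 1}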
def _create_daily_stats(self, perfs):
# create daily and cumulative stats dataframe
daily_perfs = []
# TODO: the loop here could overwrite expected properties
# of daily_perf. Could potentially raise or log a
# warning.
for perf in perfs:
if 'daily_perf' in perf:
perf['daily_perf'].update(
perf['daily_perf'].pop('recorded_vars')
)
perf['daily_perf'].update(perf['cumulative_risk_metrics'])
daily_perfs.append(perf['daily_perf'])
else:
self.risk_report = perf
        daily_dts = pd.DatetimeIndex(
            [perf['period_close'] for perf in daily_perfs], tz='UTC')
daily_stats = pd.DataFrame(daily_perfs, index=daily_dts)
return daily_stats
def _pipeline_output(self, pipeline, chunks):
"""
Internal implementation of `pipeline_output`.
"""
today = normalize_date(self.get_datetime())
try:
data = self._pipeline_cache.unwrap(today)
except Expired:
data, valid_until = self._run_pipeline(
pipeline, today, next(chunks),
)
self._pipeline_cache = CachedObject(data, valid_until)
# Now that we have a cached result, try to return the data for today.
try:
return data.loc[today]
except KeyError:
# This happens if no assets passed the pipeline screen on a given
# day.
return pd.DataFrame(index=[], columns=data.columns)
def frame_from_bardata(self, data, algo_dt):
"""
Create a DataFrame from the given BarData and algo dt.
"""
data = data._data
frame_data = np.empty((len(self.fields), len(self.sids))) * np.nan
for j, sid in enumerate(self.sids):
sid_data = data.get(sid)
if not sid_data:
continue
if algo_dt != sid_data['dt']:
continue
for i, field in enumerate(self.fields):
frame_data[i, j] = sid_data.get(field, np.nan)
return pd.DataFrame(
frame_data,
index=self.fields.copy(),
columns=self.sids.copy(),
)
def update_dividends(self, new_dividends):
"""
Update our dividend frame with new dividends. @new_dividends should be
a DataFrame with columns containing at least the entries in
zipline.protocol.DIVIDEND_FIELDS.
"""
# Mark each new dividend with a unique integer id. This ensures that
# we can differentiate dividends whose date/sid fields are otherwise
# identical.
new_dividends['id'] = np.arange(
self._dividend_count,
self._dividend_count + len(new_dividends),
)
self._dividend_count += len(new_dividends)
self.dividend_frame = sort_values(pd.concat(
[self.dividend_frame, new_dividends]
), ['pay_date', 'ex_date']).set_index('id', drop=False)
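The id column is just a running counter carved out of `np.arange`, so two dividends with identical sid and date fields still get distinct keys:

import numpy as np
import pandas as pd

new_dividends = pd.DataFrame({'sid': [1, 1], 'amount': [0.5, 0.5]})  # identical rows
dividend_count = 3                                                   # ids handed out so far
new_dividends['id'] = np.arange(dividend_count,
                                dividend_count + len(new_dividends))
print(new_dividends['id'].tolist())  # [3, 4]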
def create_test_panel_ohlc_source(sim_params, env):
start = sim_params.first_open \
if sim_params else pd.datetime(1990, 1, 3, 0, 0, 0, 0, pytz.utc)
end = sim_params.last_close \
if sim_params else pd.datetime(1990, 1, 8, 0, 0, 0, 0, pytz.utc)
index = env.days_in_range(start, end)
price = np.arange(0, len(index)) + 100
high = price * 1.05
low = price * 0.95
open_ = price + .1 * (price % 2 - .5)
volume = np.ones(len(index)) * 1000
arbitrary = np.ones(len(index))
df = pd.DataFrame({'price': price,
'high': high,
'low': low,
'open': open_,
'volume': volume,
'arbitrary': arbitrary},
index=index)
panel = pd.Panel.from_dict({0: df})
return DataPanelSource(panel), panel
def add_frame(self, tick, frame, minor_axis=None, items=None):
"""
"""
if self._pos == self.cap:
self._roll_data()
if isinstance(frame, pd.DataFrame):
minor_axis = frame.columns
items = frame.index
if set(minor_axis).difference(set(self.minor_axis)) or \
set(items).difference(set(self.items)):
self._update_buffer(frame)
vals = frame.T.astype(self.dtype)
self.buffer.loc[:, self._pos, :] = vals
self.date_buf[self._pos] = tick
self._pos += 1
def getindexdaily(self,code,start,end):
total=[]
startdate = datetime.datetime.strptime(start, "%Y-%m-%d")
enddate=datetime.datetime.strptime(end, "%Y-%m-%d")
series={"date":[],"open":[],"close":[],"high":[],"low":[],"volume":[]}
for stockdaily in self.index[code].find({"date": {"$gte": startdate,"$lt":enddate}}).sort("date"):
series["date"].append(stockdaily["date"])
series["open"].append(stockdaily["open"])
series["close"].append(stockdaily["close"])
series["high"].append(stockdaily["high"])
series["low"].append(stockdaily["low"])
series["volume"].append(stockdaily["volume"])
totaldata=zip(series['date'],series['open'],series['close'],series['high'],series['low'],series['volume'])
        df = pd.DataFrame(list(totaldata),
                          columns=['date', 'open', 'close', 'high', 'low', 'volume'])
        df.index = df.date
return df
def read_treasure_from_mongodb(self,start,end):
startdate=start
enddate=end
series={"Time Period":[],"1month":[],"3month":[],"6month":[],"1year":[],"2year":[],"3year":[],"5year":[],"7year":[],"10year":[],"20year":[],"30year":[]}
if type(start) is types.StringType:
startdate = datetime.datetime.strptime(start, "%Y-%m-%d")
if type(end) is types.StringType:
enddate=datetime.datetime.strptime(end, "%Y-%m-%d")
for treasuredaily in self.treasure['treasure'].find({"Time Period": {"$gte": startdate,"$lt":enddate}}).sort("date"):
series["Time Period"].append(treasuredaily["Time Period"])
series["1month"].append(treasuredaily["1month"])
series["3month"].append(treasuredaily["3month"])
series["6month"].append(treasuredaily["6month"])
series["1year"].append(treasuredaily["1year"])
series["2year"].append(treasuredaily["2year"])
series["3year"].append(treasuredaily["3year"])
series["5year"].append(treasuredaily["5year"])
series["7year"].append(treasuredaily["7year"])
series["10year"].append(treasuredaily["10year"])
series["20year"].append(treasuredaily["20year"])
series["30year"].append(treasuredaily["30year"])
totaldata=zip(series["1month"],series["3month"],series["6month"],series["1year"],series["2year"],series["3year"],series["5year"],series["7year"],series["10year"],series["20year"],series["30year"])
df = pd.DataFrame(data=list(totaldata),index=series["Time Period"],columns = ['1month', '3month','6month', '1year', '2year', '3year', '5year', '7year', '10year', '20year', '30year'])
return df.sort_index().tz_localize('UTC')
def __init__(self, constants, dates, sids):
loaders = {}
for column, const in iteritems(constants):
frame = DataFrame(
const,
index=dates,
columns=sids,
dtype=column.dtype,
)
loaders[column] = DataFrameLoader(
column=column,
baseline=frame,
adjustments=None,
)
self._loaders = loaders
def test_consume_metadata(self):
# Test dict consumption
dict_to_consume = {0: {'symbol': 'PLAY'},
1: {'symbol': 'MSFT'}}
self.env.write_data(equities_data=dict_to_consume)
finder = self.asset_finder_type(self.env.engine)
equity = finder.retrieve_asset(0)
self.assertIsInstance(equity, Equity)
self.assertEqual('PLAY', equity.symbol)
# Test dataframe consumption
df = pd.DataFrame(columns=['asset_name', 'exchange'], index=[0, 1])
        df.loc[0, 'asset_name'] = "Dave'N'Busters"
        df.loc[0, 'exchange'] = "NASDAQ"
        df.loc[1, 'asset_name'] = "Microsoft"
        df.loc[1, 'exchange'] = "NYSE"
self.env = TradingEnvironment(load=noop_load)
self.env.write_data(equities_df=df)
finder = self.asset_finder_type(self.env.engine)
self.assertEqual('NASDAQ', finder.retrieve_asset(0).exchange)
self.assertEqual('Microsoft', finder.retrieve_asset(1).asset_name)
def setUp(self):
self.env = TradingEnvironment()
self.days = self.env.trading_days[:5]
self.panel = pd.Panel({1: pd.DataFrame({
'price': [1, 1, 2, 4, 8], 'volume': [1e9, 1e9, 1e9, 1e9, 0],
'type': [DATASOURCE_TYPE.TRADE,
DATASOURCE_TYPE.TRADE,
DATASOURCE_TYPE.TRADE,
DATASOURCE_TYPE.TRADE,
DATASOURCE_TYPE.CLOSE_POSITION]},
index=self.days)
})
self.no_close_panel = pd.Panel({1: pd.DataFrame({
'price': [1, 1, 2, 4, 8], 'volume': [1e9, 1e9, 1e9, 1e9, 1e9],
'type': [DATASOURCE_TYPE.TRADE,
DATASOURCE_TYPE.TRADE,
DATASOURCE_TYPE.TRADE,
DATASOURCE_TYPE.TRADE,
DATASOURCE_TYPE.TRADE]},
index=self.days)
})
def test_bfill(self):
# test ndim=1
N = 100
s = pd.Series(np.random.randn(N))
mask = random.sample(range(N), 10)
s.iloc[mask] = np.nan
correct = s.bfill().values
test = bfill(s.values)
assert_almost_equal(correct, test)
# test ndim=2
df = pd.DataFrame(np.random.randn(N, N))
df.iloc[mask] = np.nan
correct = df.bfill().values
test = bfill(df.values)
assert_almost_equal(correct, test)
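The `bfill` helper under test isn't shown; a minimal pandas-backed sketch that this test would accept (the real implementation may well be pure numpy):

import numpy as np
import pandas as pd

def bfill(values):
    # backward-fill NaNs along the first axis for 1-D or 2-D arrays
    if values.ndim == 1:
        return pd.Series(values).fillna(method='bfill').values
    return pd.DataFrame(values).fillna(method='bfill').values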