def df_add(self,column,added_info):
    '''
    Args
        column (string):
            the column to be modified
        added_info (string, int, float, pandas.Series or pandas.DataFrame):
            the information to be added to the selected column; a string is
            interpreted as the name of another column
    Returns
        -
    '''
if isinstance(added_info,str):
self.data_df[column] = self.data_df[column] + self.data_df[added_info]
elif isinstance(added_info,(int,float)):
self.data_df[column] = self.data_df[column] + added_info
elif isinstance(added_info,(pd.Series,pd.DataFrame)):
self.data_df[column] = self.data_df[column] + added_info
# This function subtracts information from a given column
def df_minus(self,column,minus_info):
    '''
    Args
        column (string):
            the column to be modified
        minus_info (string, int, float, pandas.Series or pandas.DataFrame):
            the information to be subtracted from the selected column; a string
            is interpreted as the name of another column
    Returns
        -
    '''
if isinstance(minus_info,str):
self.data_df[column] = self.data_df[column] - self.data_df[minus_info]
elif isinstance(minus_info,(int,float)):
self.data_df[column] = self.data_df[column] - minus_info
    elif isinstance(minus_info,(pd.Series,pd.DataFrame)):
        self.data_df[column] = self.data_df[column] - minus_info
# This function multiplies the selected column by a certain factor
def df_multiply(self,column,multiply_info):
    '''
    Args
        column (string):
            the column to be modified
        multiply_info (string, int, float, pandas.Series or pandas.DataFrame):
            the information to multiply the selected column by; a string is
            interpreted as the name of another column
    Returns
        -
    '''
if isinstance(multiply_info,str):
self.data_df[column] = self.data_df[column] * self.data_df[multiply_info]
elif isinstance(multiply_info,(int,float)):
self.data_df[column] = self.data_df[column] * multiply_info
    elif isinstance(multiply_info,(pd.Series,pd.DataFrame)):
        self.data_df[column] = self.data_df[column] * multiply_info
# This function divides the selected column by a certain factor
def df_division(self,column,division_info):
    '''
    Args
        column (string):
            the column to be modified
        division_info (string, int, float, pandas.Series or pandas.DataFrame):
            the information to divide the selected column by; a string is
            interpreted as the name of another column
    Returns
        -
    '''
if isinstance(division_info,str):
self.data_df[column] = self.data_df[column] / self.data_df[division_info]
elif isinstance(division_info,(int,float)):
self.data_df[column] = self.data_df[column] / division_info
    elif isinstance(division_info,(pd.Series,pd.DataFrame)):
        self.data_df[column] = self.data_df[column] / division_info
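# A minimal usage sketch for the four arithmetic helpers above, assuming they
# are methods of a wrapper class (hypothetically named DataWrapper here) that
# keeps its table in self.data_df:
# wrapper = DataWrapper(pd.DataFrame({'a': [1.0, 2.0], 'b': [10.0, 20.0]}))
# wrapper.df_add('a', 5)         # column 'a' becomes [6.0, 7.0]
# wrapper.df_add('a', 'b')       # adds column 'b': 'a' becomes [16.0, 27.0]
# wrapper.df_multiply('a', 0.5)  # halves column 'a'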
def test_ABS():
text = """
ABS(X);
"""
param1 = {
'X': pd.Series([-2, -1, -0.5, 9.8]),
'RESULT': pd.Series([2, 1, 0.5, 9.8])
}
param2 = {
'X': pd.Series([-2, -1, 0, 9]),
'RESULT': pd.Series([2, 1, 0, 9])
}
params = [param1, param2]
testfunc(text, params)
def test_SMA():
text = """
SMA(X, M, N);
"""
param1 = {
'X': pd.Series([10.2, 30.9, 30.48, 39.34, 43.3, 45.9, 30.48, 39.34, 45.9, 30.48, 39.34]),
'M': 5,
'N': 3,
'RESULT': pd.Series(
[10.2, 24.985714, 28.507692, 35.177833, 40.101552, 43.594930, 35.713058, 37.890650, 42.697520, 35.366239,
37.750596])
}
params = [param1]
testfunc(text, params, True, True)
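# A hedged note on the SMA expectations above: the RESULT values appear
# consistent with pandas' adjusted exponentially weighted mean with
# alpha = N / M, e.g. (30.9 + 0.4 * 10.2) / 1.4 == 24.985714... A sketch of a
# reference implementation under that assumption (not necessarily the code
# under test):
def sma_reference(x, m, n):
    # ewm with alpha=n/m and adjust=True reproduces RESULT to the shown precision
    return x.ewm(alpha=float(n) / m, adjust=True).mean()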
def CROSS(self, param):
    # CROSS(A, B) is True on the bar where A crosses above B:
    # A <= B on the previous bar and A > B on the current bar.
    if not isinstance(param[0], pd.core.series.Series) and not isinstance(param[1], pd.core.series.Series):
        print('Invalid data type is detected.')
        return False
    if not isinstance(param[0], pd.core.series.Series):
        # scalar A: compare the constant against B's previous and current values
        x1 = param[0]
        x2 = param[0]
        y1 = param[1].shift(1)
        y2 = param[1]
    elif not isinstance(param[1], pd.core.series.Series):
        # scalar B: compare A's previous and current values against the constant
        x1 = param[0].shift(1)
        x2 = param[0]
        y1 = param[1]
        y2 = param[1]
    else:
        # both Series: shift each one to obtain its previous-bar values
        x1 = param[0].shift(1)
        x2 = param[0]
        y1 = param[1].shift(1)
        y2 = param[1]
    return (x1 <= y1) & (x2 > y2)
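# A small, hedged usage sketch for CROSS: detecting where a fast moving
# average crosses above a slow one (names and data are illustrative only):
# fast = prices.rolling(5).mean()
# slow = prices.rolling(20).mean()
# crossed_up = indicator.CROSS([fast, slow])  # boolean Series, True on cross-up bars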
def MAX(self, param):
if isinstance(param[0], pd.core.series.Series):
df = pd.DataFrame(index = param[0].index)
elif isinstance(param[1], pd.core.series.Series):
df = pd.DataFrame(index = param[1].index)
else:
df = None
if df is None:
return np.max(param)
df['A'] = param[0]
df['B'] = param[1]
def callback(row):
if row['A'] >= row['B']:
return row['A']
else:
return row['B']
    result = df.apply(callback, axis=1)
return result
def MIN(self, param):
if isinstance(param[0], pd.core.series.Series):
df = pd.DataFrame(index = param[0].index)
elif isinstance(param[1], pd.core.series.Series):
df = pd.DataFrame(index = param[1].index)
else:
df = None
if df is None:
        return np.min(param)
df['A'] = param[0]
df['B'] = param[1]
def callback(row):
if row['A'] <= row['B']:
return row['A']
else:
return row['B']
    result = df.apply(callback, axis=1)
return result
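# A hedged aside: the row-wise apply in MAX and MIN above can be replaced by
# numpy's vectorized element-wise operations, which broadcast a scalar against
# a Series and avoid a per-row Python callback. This is a sketch, not the
# original author's implementation:
import numpy as np

def elementwise_max(a, b):
    # works for Series/Series, Series/scalar and scalar/scalar inputs
    return np.maximum(a, b)

def elementwise_min(a, b):
    return np.minimum(a, b)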
def setUp(self):
scores = pd.Series(np.ones(8), dtype=np.float32)
np_data = np.array([
[1, 'a'],
[2, 'b'],
[4, 'a'],
[3, 'c'],
[3, 'b'],
[5, 'c'],
[4, 'c'],
[1, 'b'],
])
col_labels = ['item_id', 'link_id']
self.input_df = pd.DataFrame(data=np_data, columns=col_labels)
self.input_df['score'] = scores
self.sparse = SparseTransform()
self.out = self.sparse.transform(self.input_df)
def rise_rate(df):
    # month-over-month rise rate of each user's mean power consumption
date1_2 = df[record_date].map(lambda x: str2time(x)).max()
date1_1 = datetime.datetime(date1_2.year, date1_2.month, 1).date()
grouped1 = DataView(df).filter_by_record_date2(date1_1, date1_2)[[user_id, power_consumption]].groupby([user_id], as_index=False).mean()
from dateutil.relativedelta import relativedelta
date2_1 = date1_1 - relativedelta(months=+1)
date2_2 = date1_2 - relativedelta(months=+1)
grouped2 = DataView(df).filter_by_record_date2(date2_1, date2_2)[[user_id, power_consumption]].groupby([user_id], as_index=False).mean()
print(date1_1,date1_2, date2_1, date2_2)
print(grouped1)
print(grouped2)
user_rise_rate = pd.Series(map(lambda x, y: float(x - y) / y, grouped1[power_consumption], grouped2[power_consumption]))
user_rise_rate.name = 'user_rise_rate'
return grouped1[[user_id]].join(user_rise_rate)
def create_agents(self, generator):
"""
Given information on a set of countries and a generator function,
generate the agents and assign the results to ``self.agents``.
    :type generator: callable
:param generator: A function which generates the agents.
"""
self.generator = generator
country_array = pd.concat([pd.Series([c] * k["Population"]) for c, k in self.df.iterrows()])
country_array.index = range(len(country_array))
# Garbage collect before creating new processes.
gc.collect()
self.agents = pd.concat(
self.pool.imap(self._gen_agents,
np.array_split(country_array, self.processes * self.splits))
)
self.agents.index = range(len(self.agents))
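# How the parallel pattern above fits together, as a hedged sketch:
# country_array holds one entry per agent (each country label repeated
# "Population" times), it is split into processes * splits chunks, and each
# chunk goes to a pool worker. The _gen_agents worker is not shown in this
# snippet; a hypothetical body consistent with the call site might be:
# def _gen_agents(self, chunk):
#     # apply the stored generator to every country label in the chunk
#     return pd.Series([self.generator(c) for c in chunk])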
def minScalErr(stec,el,z,thisBias):
"""
    This determines the slope of the vTEC vs. elevation line, which
    should be minimized in the minimum-scalloping technique for
    receiver bias removal.
inputs:
stec - time indexed Series of slant TEC values
el - corresponding elevation values, also Series
z - mapping function values to convert to vTEC from entire file, may
contain nans, Series
thisBias - the bias to be tested and minimized
"""
intel=np.asarray(el[stec.index],int) # bin the elevation values into int
sTEC=np.asarray(stec,float)
zmap = z[stec.index]
c=np.array([(i,np.average((sTEC[intel==i]-thisBias)
/zmap[intel==i])) for i in np.unique(intel) if i>30])
return np.polyfit(c[:,0],c[:,1],1)[0]
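# A hedged usage sketch: the receiver bias that flattens the vTEC-vs-elevation
# slope can be found by minimizing |minScalErr| over candidate biases (the
# data below is synthetic and purely illustrative):
# from scipy.optimize import minimize_scalar
# stec = pd.Series([12.0, 14.0, 11.5, 13.5])
# el = pd.Series([35.0, 52.0, 36.0, 51.0])
# z = pd.Series([1.25, 1.10, 1.24, 1.11])
# best = minimize_scalar(lambda b: abs(minScalErr(stec, el, z, b)))
# print(best.x)  # estimated receiver bias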
def generate_summary(df):
level_counts = df.Level.value_counts().to_dict()
    zlist = list(zip(*[('<a href="#info">Items Processed Successfully</a>', level_counts.get('INFO', 0)),
('<a href="#warning">Items Skipped Due to a Warning</a>', level_counts.get('WARNING', 0)),
('<a href="#error">Items Skipped Due to an Error</a>', level_counts.get('ERROR', 0))]))
level_counts = pd.Series(zlist[1], index=zlist[0])
level_counts.name = "Count"
info_counts = df.query("Level == 'INFO'").Message.value_counts().to_dict()
zlist = list(zip(*[('No Action', info_counts.get('SKIP', 0)),
('Update', info_counts.get('UPDATE', 0)),
('Create', info_counts.get('CREATE', 0))]))
info_counts = pd.Series(zlist[1], index=zlist[0])
info_counts.name = "Count"
warning_counts = df.query("Level == 'WARNING'")['Msg Type'].value_counts()
warning_counts.name = "Count"
error_counts = df.query("Level == 'ERROR'")['Msg Type'].value_counts()
error_counts.name = "Count"
return level_counts, info_counts, warning_counts, error_counts
def _format_min_growth(min_growth, species):
"""Format min_growth into a pandas series.
Arguments
---------
min_growth : positive float or array-like object.
The minimum growth rate for each individual in the community. Either
a single value applied to all individuals or one value for each.
species : array-like
The ID for each individual model in the community.
Returns
-------
pandas.Series
A pandas Series mapping each individual to its minimum growth rate.
"""
try:
min_growth = float(min_growth)
except (TypeError, ValueError):
if len(min_growth) != len(species):
raise ValueError(
"min_growth must be single value or an array-like "
"object with an entry for each species in the model.")
return pd.Series(min_growth, species)
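# A short usage sketch (values illustrative): a scalar broadcasts to every
# species, while an array-like must match the species list in length.
# _format_min_growth(0.1, ["sp1", "sp2"])         # sp1 -> 0.1, sp2 -> 0.1
# _format_min_growth([0.1, 0.2], ["sp1", "sp2"])  # sp1 -> 0.1, sp2 -> 0.2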
# Source: two_sigma_financial_modelling.py (project: PortfolioTimeSeriesAnalysis, author: MizioAnd)
def clean_data(self, df, is_with_MICE=0):
df = df.copy()
if df.isnull().sum().sum() > 0:
if is_with_MICE:
# Imputation using MICE
numerical_features_names = self.extract_numerical_features(df)
df.loc[:, tuple(numerical_features_names)] = self.estimate_by_mice(df[numerical_features_names])
else:
                if 'y' in df.columns:
                    df = df.dropna()
                else:
                    df = df.dropna(axis=1)
TwoSigmaFinModTools._feature_names_num = pd.Series(data=np.intersect1d(
TwoSigmaFinModTools._feature_names_num.values, df.columns), dtype=object)
TwoSigmaFinModTools._numerical_feature_names = TwoSigmaFinModTools.extract_numerical_features(df)
return df
def predict_job(job_list):
"""Assign a classification to a url"""
# TODO: Add case where len is 1 or 0....
job_list = [job for j in job_list for job in j]
new_job_list = [regex.tokenize_and_stem(i) for i in job_list]
new_job_list = [' '.join(job) for job in new_job_list]
vect = CountVectorizer()
    # X and y are assumed to be module-level training data (titles and labels)
    x_series = pd.Series(X)
X_train_dtm = vect.fit_transform(x_series)
y_train = pd.Series(y)
job_list_series = pd.Series(new_job_list)
job_list_dtm = vect.transform(job_list_series)
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)
y_pred = nb.predict(job_list_dtm)
# for i in range(len(job_list)):
# print(job_list[i], y_pred[i])
return y_pred
# print(predict_job([('Founder',), ('Founder',), ('Architect & Full-stack developer',), ('Senior Engineer',), ('Technical Consultant',)]))
def count_pairs(data):
df = pd.DataFrame(data)
start, target = df.columns.tolist()
# first we create groups for each pair and take size of each group as count.
# counts is a pandas.Series with the pairs as index
counts = df.groupby([start, target]).size()
    # then we remove duplicate pairs from the original dataframe,
    # so length and counts are equal in size
    df = df.drop_duplicates()
    # set the index to the pair values so it matches the index of counts
    df.set_index([0, 1], inplace=True, drop=False)
# now we append the counts as column to the original data
df[2] = pd.Series(counts.values, index=counts.index)
# just cast pandas-dataframe back to numpy 2d-array usable for following
# steps
array = df.values
return array
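# A small usage sketch with synthetic data: the pair (0, 1) appears twice, so
# the appended third column carries each unique pair's count.
# data = np.array([[0, 1], [0, 1], [1, 2]])
# count_pairs(data)
# # -> array([[0, 1, 2],
# #           [1, 2, 1]])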
def _hpd_interval(self, x, width):
"""
Code adapted from pymc3.stats.calc_min_interval:
https://github.com/pymc-devs/pymc3/blob/master/pymc3/stats.py
"""
x = np.sort(x)
n = len(x)
interval_idx_inc = int(np.floor(width * n))
n_intervals = n - interval_idx_inc
interval_width = x[interval_idx_inc:] - x[:n_intervals]
if len(interval_width) == 0:
raise ValueError('Too few elements for interval calculation')
min_idx = np.argmin(interval_width)
hdi_min = x[min_idx]
hdi_max = x[min_idx + interval_idx_inc]
    index = ['hpd{}_{}'.format(width, side) for side in ['lower', 'upper']]
return pd.Series([hdi_min, hdi_max], index=index)
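# A hedged usage sketch: for a posterior sample, the 95% HPD interval is the
# narrowest window containing 95% of the draws. Since `self` is unused above,
# an unbound call is shown purely for illustration:
# samples = np.random.default_rng(0).normal(size=1000)
# _hpd_interval(None, samples, 0.95)
# # -> Series indexed ['hpd0.95_lower', 'hpd0.95_upper']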
def regression(nx, ny):
"""
Parameters
==========
specturm : pd.series
Pandas Series object
nodes : list
of nodes to be used for the continuum
Returns
=======
corrected : array
Continuum corrected array
continuum : array
The continuum used to correct the data
x : array
The potentially truncated x values
"""
m, b, r_value, p_value, stderr = ss.linregress(nx, ny)
c = m * nx + b
return c
def fit(self,init_data,data):
"""
Import data to SPOT object
Parameters
----------
init_data : list, numpy.array or pandas.Series
initial batch to calibrate the algorithm
    data : list, numpy.array or pandas.Series
        data for the run
"""
if isinstance(data,list):
self.data = np.array(data)
elif isinstance(data,np.ndarray):
self.data = data
elif isinstance(data,pd.Series):
self.data = data.values
else:
print('This data format (%s) is not supported' % type(data))
return
if isinstance(init_data,list):
self.init_data = np.array(init_data)
elif isinstance(init_data,np.ndarray):
self.init_data = init_data
elif isinstance(init_data,pd.Series):
self.init_data = init_data.values
elif isinstance(init_data,int):
self.init_data = self.data[:init_data]
self.data = self.data[init_data:]
    elif isinstance(init_data, float) and 0 < init_data < 1:
r = int(init_data*data.size)
self.init_data = self.data[:r]
self.data = self.data[r:]
else:
print('The initial data cannot be set')
return
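# A hedged usage sketch of the fit() contract above (the constructor and any
# detection methods are assumptions, not shown in this snippet): an int takes
# that many leading points as the calibration batch, while a float in (0, 1)
# takes that proportion of the stream instead.
# spot = SPOT()                # hypothetical constructor
# stream = np.random.randn(10000)
# spot.fit(2000, stream)       # first 2000 points calibrate, the rest run
# spot.fit(0.2, stream)        # or: the first 20% calibrates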
def fit(self,init_data,data):
"""
Import data to biSPOT object
Parameters
----------
init_data : list, numpy.array or pandas.Series
        initial batch to calibrate the algorithm
    data : list, numpy.array or pandas.Series
        data for the run
"""
if isinstance(data,list):
self.data = np.array(data)
elif isinstance(data,np.ndarray):
self.data = data
elif isinstance(data,pd.Series):
self.data = data.values
else:
print('This data format (%s) is not supported' % type(data))
return
if isinstance(init_data,list):
self.init_data = np.array(init_data)
elif isinstance(init_data,np.ndarray):
self.init_data = init_data
elif isinstance(init_data,pd.Series):
self.init_data = init_data.values
elif isinstance(init_data,int):
self.init_data = self.data[:init_data]
self.data = self.data[init_data:]
    elif isinstance(init_data, float) and 0 < init_data < 1:
r = int(init_data*data.size)
self.init_data = self.data[:r]
self.data = self.data[r:]
else:
print('The initial data cannot be set')
return
def fit(self,init_data,data):
"""
Import data to DSPOT object
Parameters
----------
init_data : list, numpy.array or pandas.Series
initial batch to calibrate the algorithm
    data : list, numpy.array or pandas.Series
        data for the run
"""
if isinstance(data,list):
self.data = np.array(data)
elif isinstance(data,np.ndarray):
self.data = data
elif isinstance(data,pd.Series):
self.data = data.values
else:
print('This data format (%s) is not supported' % type(data))
return
if isinstance(init_data,list):
self.init_data = np.array(init_data)
elif isinstance(init_data,np.ndarray):
self.init_data = init_data
elif isinstance(init_data,pd.Series):
self.init_data = init_data.values
elif isinstance(init_data,int):
self.init_data = self.data[:init_data]
self.data = self.data[init_data:]
    elif isinstance(init_data, float) and 0 < init_data < 1:
r = int(init_data*data.size)
self.init_data = self.data[:r]
self.data = self.data[r:]
else:
print('The initial data cannot be set')
return
def fit(self,init_data,data):
"""
Import data to biDSPOT object
Parameters
----------
init_data : list, numpy.array or pandas.Series
initial batch to calibrate the algorithm
    data : list, numpy.array or pandas.Series
        data for the run
"""
if isinstance(data,list):
self.data = np.array(data)
elif isinstance(data,np.ndarray):
self.data = data
elif isinstance(data,pd.Series):
self.data = data.values
else:
print('This data format (%s) is not supported' % type(data))
return
if isinstance(init_data,list):
self.init_data = np.array(init_data)
elif isinstance(init_data,np.ndarray):
self.init_data = init_data
elif isinstance(init_data,pd.Series):
self.init_data = init_data.values
elif isinstance(init_data,int):
self.init_data = self.data[:init_data]
self.data = self.data[init_data:]
    elif isinstance(init_data, float) and 0 < init_data < 1:
r = int(init_data*data.size)
self.init_data = self.data[:r]
self.data = self.data[r:]
else:
print('The initial data cannot be set')
return
def train(self, x):
"""
Train scale
Parameters
----------
        x : pd.Series | np.array
a column of data to train over
"""
raise NotImplementedError('Not Implemented')
def transform(self, x):
"""
Transform array|series x
"""
raise NotImplementedError('Not Implemented')
def inverse(self, x):
"""
Inverse transform array|series x
"""
raise NotImplementedError('Not Implemented')
def train(self, x, drop=None):
"""
Train scale
Parameters
----------
        x : pd.Series | np.array
            a column of data to train over;
            a discrete range is stored in a list
        drop : optional
            passed through to self.range.train
"""
if not len(x):
return
self.range.train(x, drop)
def transform(self, x):
"""
Transform array|series x
"""
# Discrete scales do not do transformations
return x