def mungeskewed(train, test, numeric_feats):
    ntrain = train.shape[0]
    test['loss'] = 0
    train_test = pd.concat((train, test)).reset_index(drop=True)
    skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna()))
    skewed_feats = skewed_feats[skewed_feats > 0.25]
    skewed_feats = skewed_feats.index
    for feats in skewed_feats:
        train_test[feats] = train_test[feats] + 1
        train_test[feats], lam = boxcox(train_test[feats])
    return train_test, ntrain
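A minimal usage sketch for mungeskewed, assuming pandas plus scipy.stats.skew and scipy.stats.boxcox are imported as in the snippet above; the toy frames and the cont1 column are placeholders, not part of the original code.

import pandas as pd
from scipy.stats import skew, boxcox

# hypothetical train/test frames sharing one numeric column; train carries the 'loss' target
train = pd.DataFrame({'cont1': [0.1, 0.2, 5.0, 0.3], 'loss': [10, 20, 30, 40]})
test = pd.DataFrame({'cont1': [0.15, 0.25, 4.0]})

train_test, ntrain = mungeskewed(train, test, ['cont1'])
train_part = train_test.iloc[:ntrain]   # rows from the original train set
test_part = train_test.iloc[ntrain:]    # rows from the original test set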
def features(self, q1, q2):
    q1 = str(q1).lower().split()
    q2 = str(q2).lower().split()
    q1 = [w for w in q1 if w not in stopwords]
    q2 = [w for w in q2 if w not in stopwords]
    wmd = min(self.model.wmdistance(q1, q2), 10)
    q1vec = self.sent2vec(q1)
    q2vec = self.sent2vec(q2)
    if q1vec is not None and q2vec is not None:
        cos = cosine(q1vec, q2vec)
        city = cityblock(q1vec, q2vec)
        jacc = jaccard(q1vec, q2vec)
        canb = canberra(q1vec, q2vec)
        eucl = euclidean(q1vec, q2vec)
        mink = minkowski(q1vec, q2vec, 3)
        bray = braycurtis(q1vec, q2vec)
        q1_skew = skew(q1vec)
        q2_skew = skew(q2vec)
        q1_kurt = kurtosis(q1vec)
        q2_kurt = kurtosis(q2vec)
    else:
        cos = -1
        city = -1
        jacc = -1
        canb = -1
        eucl = -1
        mink = -1
        bray = -1
        q1_skew = 0
        q2_skew = 0
        q1_kurt = 0
        q2_kurt = 0
    return wmd, cos, city, jacc, canb, eucl, mink, bray, q1_skew, q2_skew, q1_kurt, q2_kurt
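The distance block above depends on self.model and self.sent2vec; a self-contained sketch of the same feature set, assuming only NumPy/SciPy and two stand-in sentence vectors, might look like this (the 300-dimensional random vectors are placeholders).

import numpy as np
from scipy.spatial.distance import (cosine, cityblock, jaccard, canberra,
                                    euclidean, minkowski, braycurtis)
from scipy.stats import skew, kurtosis

rng = np.random.RandomState(0)
q1vec = rng.rand(300)   # stand-in for an averaged word-vector representation
q2vec = rng.rand(300)

feats = [cosine(q1vec, q2vec), cityblock(q1vec, q2vec), jaccard(q1vec, q2vec),
         canberra(q1vec, q2vec), euclidean(q1vec, q2vec), minkowski(q1vec, q2vec, 3),
         braycurtis(q1vec, q2vec),
         skew(q1vec), skew(q2vec), kurtosis(q1vec), kurtosis(q2vec)]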
def features(self, q1, q2):
    q1 = str(q1).lower().split()
    q2 = str(q2).lower().split()
    q1 = [w for w in q1 if w not in stopwords]
    q2 = [w for w in q2 if w not in stopwords]
    wmd = min(self.model.wmdistance(q1, q2), 10)
    wmd_norm = min(self.model_norm.wmdistance(q1, q2), 10)
    q1vec = self.sent2vec(q1)
    q2vec = self.sent2vec(q2)
    if q1vec is not None and q2vec is not None:
        cos = cosine(q1vec, q2vec)
        city = cityblock(q1vec, q2vec)
        jacc = jaccard(q1vec, q2vec)
        canb = canberra(q1vec, q2vec)
        eucl = euclidean(q1vec, q2vec)
        mink = minkowski(q1vec, q2vec, 3)
        bray = braycurtis(q1vec, q2vec)
        q1_skew = skew(q1vec)
        q2_skew = skew(q2vec)
        q1_kurt = kurtosis(q1vec)
        q2_kurt = kurtosis(q2vec)
    else:
        cos = -1
        city = -1
        jacc = -1
        canb = -1
        eucl = -1
        mink = -1
        bray = -1
        q1_skew = 0
        q2_skew = 0
        q1_kurt = 0
        q2_kurt = 0
    return wmd, wmd_norm, cos, city, jacc, canb, eucl, mink, bray, q1_skew, q2_skew, q1_kurt, q2_kurt
def lightcurve_moments(ftimes, fmags, ferrs):
    '''This calculates the weighted mean, stdev, median, MAD, percentiles, skew,
    kurtosis, fraction of LC beyond 1-stdev, and IQR.
    '''
    ndet = len(fmags)
    if ndet > 9:
        # now calculate the various things we need
        series_median = npmedian(fmags)
        series_wmean = (
            npsum(fmags*(1.0/(ferrs*ferrs)))/npsum(1.0/(ferrs*ferrs))
        )
        series_mad = npmedian(npabs(fmags - series_median))
        series_stdev = 1.483*series_mad
        series_skew = spskew(fmags)
        series_kurtosis = spkurtosis(fmags)
        # get the beyond1std fraction
        series_above1std = len(fmags[fmags > (series_median + series_stdev)])
        series_below1std = len(fmags[fmags < (series_median - series_stdev)])
        # this is the fraction beyond 1 stdev
        series_beyond1std = (series_above1std + series_below1std)/float(ndet)
        # get the magnitude percentiles
        series_mag_percentiles = nppercentile(
            fmags,
            [5.0, 10, 17.5, 25, 32.5, 40, 60, 67.5, 75, 82.5, 90, 95]
        )
        return {
            'median': series_median,
            'wmean': series_wmean,
            'mad': series_mad,
            'stdev': series_stdev,
            'skew': series_skew,
            'kurtosis': series_kurtosis,
            'beyond1std': series_beyond1std,
            'mag_percentiles': series_mag_percentiles,
            'mag_iqr': series_mag_percentiles[8] - series_mag_percentiles[3],
        }
    else:
        LOGERROR('not enough detections in this magseries '
                 'to calculate light curve moments')
        return None
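The np*/sp* names above are module-level aliases for numpy and scipy.stats functions; a short sketch of the same moments on a synthetic magnitude series, written directly against numpy and scipy.stats:

import numpy as np
from scipy.stats import skew, kurtosis

rng = np.random.RandomState(42)
fmags = rng.normal(12.0, 0.05, size=200)   # synthetic magnitudes
ferrs = np.full_like(fmags, 0.01)          # synthetic per-point errors

median = np.median(fmags)
wmean = np.sum(fmags / ferrs**2) / np.sum(1.0 / ferrs**2)
mad = np.median(np.abs(fmags - median))
stdev = 1.483 * mad                        # MAD-based robust standard deviation
beyond1std = np.mean(np.abs(fmags - median) > stdev)
print(median, wmean, mad, stdev, skew(fmags), kurtosis(fmags), beyond1std)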
def create_scipy_features(base_features, sentinel):
    r"""Calculate the skew, kurtosis, and other statistical features
    for each row.

    Parameters
    ----------
    base_features : numpy array
        The feature dataframe.
    sentinel : float
        The number to be imputed for NaN values.

    Returns
    -------
    sp_features : numpy array
        The calculated SciPy features.
    """
    logger.info("Creating SciPy Features")
    # Generate scipy features
    logger.info("SciPy Feature: geometric mean")
    row_gmean = sps.gmean(base_features, axis=1)
    logger.info("SciPy Feature: kurtosis")
    row_kurtosis = sps.kurtosis(base_features, axis=1)
    logger.info("SciPy Feature: kurtosis test")
    row_ktest, pvalue = sps.kurtosistest(base_features, axis=1)
    logger.info("SciPy Feature: normal test")
    row_normal, pvalue = sps.normaltest(base_features, axis=1)
    logger.info("SciPy Feature: skew")
    row_skew = sps.skew(base_features, axis=1)
    logger.info("SciPy Feature: skew test")
    row_stest, pvalue = sps.skewtest(base_features, axis=1)
    logger.info("SciPy Feature: variation")
    row_var = sps.variation(base_features, axis=1)
    logger.info("SciPy Feature: signal-to-noise ratio")
    # note: scipy.stats.signaltonoise was removed in SciPy 1.0, so this call requires an older SciPy
    row_stn = sps.signaltonoise(base_features, axis=1)
    logger.info("SciPy Feature: standard error of mean")
    row_sem = sps.sem(base_features, axis=1)
    sp_features = np.column_stack((row_gmean, row_kurtosis, row_ktest,
                                   row_normal, row_skew, row_stest,
                                   row_var, row_stn, row_sem))
    sp_features = impute_values(sp_features, 'float64', sentinel)
    sp_features = StandardScaler().fit_transform(sp_features)
    # Return new SciPy features
    logger.info("SciPy Feature Count : %d", sp_features.shape[1])
    return sp_features
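A trimmed sketch of the same row-wise statistics without the project-specific logger and impute_values helpers; since scipy.stats.signaltonoise is gone from current SciPy, the sketch substitutes a plain mean/std ratio (an assumption, not the original call).

import numpy as np
import scipy.stats as sps
from sklearn.preprocessing import StandardScaler

X = np.random.rand(100, 20)              # stand-in for base_features

sp_features = np.column_stack((
    sps.gmean(X, axis=1),
    sps.kurtosis(X, axis=1),
    sps.skew(X, axis=1),
    sps.variation(X, axis=1),
    X.mean(axis=1) / X.std(axis=1),      # mean/std ratio in place of the removed signaltonoise
    sps.sem(X, axis=1),
))
sp_features = StandardScaler().fit_transform(sp_features)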
#
# Function create_clusters
#
def create_images_for_labeling(pars):
    # note: Python 2 code (print statements, old-style "except Exception, e")
    import scipy.stats as st
    import os
    import numpy as np
    import calblitz as cb
    from glob import glob
    try:
        f_name = pars
        cdir = os.path.dirname(f_name)
        print 'loading'
        m = cb.load(f_name)
        print 'corr image'
        img = m.local_correlations(eight_neighbours=True)
        im = cb.movie(img, fr=1)
        im.save(os.path.join(cdir, 'correlation_image.tif'))
        print 'std image'
        img = np.std(m, 0)
        im = cb.movie(np.array(img), fr=1)
        im.save(os.path.join(cdir, 'std_projection.tif'))
        m1 = m.resize(1, 1, 1. / m.fr)
        print 'median image'
        img = np.median(m1, 0)
        im = cb.movie(np.array(img), fr=1)
        im.save(os.path.join(cdir, 'median_projection.tif'))
        print 'save BL'
        m1 = m1 - img
        m1.save(os.path.join(cdir, 'MOV_BL.tif'))
        m1 = m1.bilateral_blur_2D()
        m1.save(os.path.join(cdir, 'MOV_BL_BIL.tif'))
        m = np.array(m1)
        print 'max image'
        img = np.max(m, 0)
        im = cb.movie(np.array(img), fr=1)
        im.save(os.path.join(cdir, 'max_projection.tif'))
        print 'skew image'
        img = st.skew(m, 0)
        im = cb.movie(img, fr=1)
        im.save(os.path.join(cdir, 'skew_projection.tif'))
        del m
        del m1
    except Exception, e:
        return e
    return f_name
Source file: test_window.py (project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda, author: SignalMedia)
def test_rolling_functions_window_non_shrinkage(self):
    # GH 7764
    s = Series(range(4))
    s_expected = Series(np.nan, index=s.index)
    df = DataFrame([[1, 5], [3, 2], [3, 9], [-1, 0]], columns=['A', 'B'])
    df_expected = DataFrame(np.nan, index=df.index, columns=df.columns)
    df_expected_panel = Panel(items=df.index, major_axis=df.columns,
                              minor_axis=df.columns)

    functions = [lambda x: (x.rolling(window=10, min_periods=5)
                            .cov(x, pairwise=False)),
                 lambda x: (x.rolling(window=10, min_periods=5)
                            .corr(x, pairwise=False)),
                 lambda x: x.rolling(window=10, min_periods=5).max(),
                 lambda x: x.rolling(window=10, min_periods=5).min(),
                 lambda x: x.rolling(window=10, min_periods=5).sum(),
                 lambda x: x.rolling(window=10, min_periods=5).mean(),
                 lambda x: x.rolling(window=10, min_periods=5).std(),
                 lambda x: x.rolling(window=10, min_periods=5).var(),
                 lambda x: x.rolling(window=10, min_periods=5).skew(),
                 lambda x: x.rolling(window=10, min_periods=5).kurt(),
                 lambda x: x.rolling(
                     window=10, min_periods=5).quantile(quantile=0.5),
                 lambda x: x.rolling(window=10, min_periods=5).median(),
                 lambda x: x.rolling(window=10, min_periods=5).apply(sum),
                 lambda x: x.rolling(win_type='boxcar',
                                     window=10, min_periods=5).mean()]
    for f in functions:
        try:
            s_result = f(s)
            assert_series_equal(s_result, s_expected)

            df_result = f(df)
            assert_frame_equal(df_result, df_expected)
        except (ImportError):
            # scipy needed for rolling_window
            continue

    functions = [lambda x: (x.rolling(window=10, min_periods=5)
                            .cov(x, pairwise=True)),
                 lambda x: (x.rolling(window=10, min_periods=5)
                            .corr(x, pairwise=True))]
    for f in functions:
        df_result_panel = f(df)
        assert_panel_equal(df_result_panel, df_expected_panel)
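For reference, the rolling .skew() accessor exercised by this test can be tried in isolation; a quick sketch on a short random Series (min_periods leaves the first rows as NaN):

import numpy as np
import pandas as pd

s = pd.Series(np.random.randn(20))
print(s.rolling(window=10, min_periods=5).skew().tail())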
def pre_process(df):
    # LotFrontage's N/A is assigned zero, will it cause problem?
    df.fillna(value={'MasVnrType': 'None', 'MasVnrArea': 0, 'Electrical': 'SBrkr', 'FireplaceQu': 'NoFP', 'GarageType': 'Noga',
                     'GarageFinish': 'Noga', 'GarageQual': 'Noga', 'Fence': 'NoFence',
                     'BsmtFinSF1': 0, 'BsmtFinSF2': 0, 'BsmtUnfSF': 0, 'TotalBsmtSF': 0, 'BsmtFullBath': 0, 'BsmtHalfBath': 0,
                     'LotFrontage': 0},
              inplace=True)
    df.loc[:, 'YrSold'] = 2016 - df.loc[:, 'YrSold']
    df.loc[df.loc[:, 'PoolArea'] != 0, 'PoolArea'] = 1
    df.loc[:, 'Porch'] = np.sum(df.loc[:, ['EnclosedPorch', '3SsnPorch', 'ScreenPorch']], axis=1)
    df.drop(['EnclosedPorch', '3SsnPorch', 'ScreenPorch'], axis=1, inplace=True)
    df.replace({'BsmtFullBath': {3: 2}, 'LotShape': {'IR3': 'IR2'}}, inplace=True)

    # fill missing values in bsmt
    df = fill_bsmt_missing(df)

    def fill_na(df, col_name, value=None):
        if value is None:
            value = df[col_name].mean()
        df.loc[df[col_name].isnull(), col_name] = value

    fill_na(df, 'Fence', 'WD')
    fill_na(df, 'GarageArea')
    fill_na(df, 'GarageCars')
    fill_na(df, 'SaleType', df['SaleType'].mode().values[0])
    fill_na(df, 'KitchenQual', df['KitchenQual'].mode().values[0])
    fill_na(df, 'Functional', df['Functional'].mode().values[0])
    fill_na(df, 'Exterior1st', df['Exterior1st'].mode().values[0])
    fill_na(df, 'Exterior2nd', df['Exterior2nd'].mode().values[0])
    fill_na(df, 'MSZoning', 'RL')

    bool_cols = np.array([df[col_name].isnull() for col_name in df.columns])
    print('rows containing na:', np.sum(bool_cols.any(axis=0)))
    print('rows all na:', np.sum(bool_cols.all(axis=0)))

    # log1p-transform the skewed numeric features
    numeric_feats = df.dtypes[df.dtypes != "object"].index
    skewed_feats = df[numeric_feats].apply(lambda x: skew(x.dropna()))  # compute skewness
    skewed_feats = skewed_feats[skewed_feats > 0.75]
    skewed_feats = skewed_feats.index
    df[skewed_feats] = np.log1p(df[skewed_feats])
    return df
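A standalone sketch of just the skew-threshold plus log1p step from the end of pre_process, on a toy frame (the column names and values are placeholders):

import numpy as np
import pandas as pd
from scipy.stats import skew

df = pd.DataFrame({'LotArea': [8450, 9600, 11250, 215000],   # heavily right-skewed toy column
                   'OverallQual': [7, 6, 7, 8]})

numeric_feats = df.dtypes[df.dtypes != "object"].index
skewed_feats = df[numeric_feats].apply(lambda x: skew(x.dropna()))
skewed_feats = skewed_feats[skewed_feats > 0.75].index
df[skewed_feats] = np.log1p(df[skewed_feats])   # only LotArea gets transformed here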
#%%
#log transform the target: ignore for test data
#
#train_data = pre_process(train_df.copy())
#test_data = pre_process(test_df.copy())
def fit(self, X):
    # keep only non-constant columns
    self.data["cols"] = list(set(range(X.shape[1])).difference(
        np.where(np.all(X == X[0, :], axis=0))[0]))
    tX = X[:, self.data["cols"]]
    if(self.algo == "min-max"):
        self.data['min'] = np.min(tX, axis=0)
        self.data['max'] = np.max(tX, axis=0)
    elif(self.algo == "normal"):
        self.data['mu'] = np.mean(tX, axis=0)
        self.data['std'] = np.std(tX, axis=0)
    elif(self.algo == "inv-normal"):
        self.data['mu'] = np.mean(tX, axis=0)
        self.data['std'] = np.std(tX, axis=0)
    elif(self.algo == "auto-normal"):
        self.data['min'] = np.min(tX, axis=0)
        self.data['max'] = np.max(tX, axis=0)
        tX = (tX-self.data["min"])/(self.data["max"]-self.data["min"])
        boxcox = lambda x, lm: (np.sign(x)*np.abs(x)**lm-1)/lm
        self.data['boxcox'] = np.zeros(tX.shape[1])
        for d in range(tX.shape[1]):
            Xd = tX[:, d]
            if(np.unique(tX[:, d]).shape[0] < 10):
                self.data['boxcox'][d] = 1
                continue
            # choose the Box-Cox lambda that minimizes squared sample skewness
            skewness = lambda x: skew(x, bias=False)**2
            t_lm = lambda lm: np.log(np.exp(lm[0])+1)
            boxcox_Xd = lambda lm: boxcox(Xd, t_lm(lm))
            obj = lambda lm: skewness(boxcox_Xd(lm))
            bounds = [(-5, 5)]
            lm = minimize(obj, [0.], method='SLSQP', bounds=bounds,
                          options={'ftol': 1e-8, 'maxiter': 100, 'disp': False})['x']
            self.data['boxcox'][d] = t_lm(lm)
        lm = self.data['boxcox'][None, :]
        tX = boxcox(tX, lm)
        self.data['mu'] = np.mean(tX, axis=0)
        self.data['std'] = np.std(tX, axis=0)
    elif(self.algo == "auto-inv-normal"):
        self.data['min'] = np.min(tX, axis=0)
        self.data['max'] = np.max(tX, axis=0)
        tX = (tX-self.data["min"])/(self.data["max"]-self.data["min"])
        boxcox = lambda x, lm: (np.sign(x)*np.abs(x)**lm-1)/lm
        self.data['boxcox'] = np.zeros(tX.shape[1])
        for d in range(tX.shape[1]):
            Xd = tX[:, d]
            if(np.unique(tX[:, d]).shape[0] < 10):
                self.data['boxcox'][d] = 1
                continue
            skewness = lambda x: skew(x, bias=False)**2
            t_lm = lambda lm: np.log(np.exp(lm[0])+1)
            boxcox_Xd = lambda lm: boxcox(Xd, t_lm(lm))
            obj = lambda lm: skewness(boxcox_Xd(lm))
            bounds = [(-5, 5)]
            lm = minimize(obj, [0.], method='SLSQP', bounds=bounds,
                          options={'ftol': 1e-8, 'maxiter': 100, 'disp': False})['x']
            self.data['boxcox'][d] = t_lm(lm)
        lm = self.data['boxcox'][None, :]
        tX = boxcox(tX, lm)
        self.data['mu'] = np.mean(tX, axis=0)
        self.data['std'] = np.std(tX, axis=0)
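The auto-normal branch above searches for a Box-Cox lambda that minimizes the squared sample skewness; a self-contained sketch of that search on a single skewed column (the lognormal data is a placeholder):

import numpy as np
from scipy.stats import skew
from scipy.optimize import minimize

rng = np.random.RandomState(0)
x = rng.lognormal(size=500)
x = (x - x.min()) / (x.max() - x.min()) + 1e-6   # scale into (0, 1], mirroring the min-max step

boxcox = lambda v, lam: (np.sign(v) * np.abs(v) ** lam - 1) / lam
t_lm = lambda lm: np.log(np.exp(lm[0]) + 1)      # softplus keeps lambda positive
obj = lambda lm: skew(boxcox(x, t_lm(lm)), bias=False) ** 2

res = minimize(obj, [0.], method='SLSQP', bounds=[(-5, 5)],
               options={'ftol': 1e-8, 'maxiter': 100})
best_lambda = t_lm(res['x'])
print(best_lambda, skew(boxcox(x, best_lambda), bias=False))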
def get_features(df_features):
    # note: Python 2 code (mixed print statements)
    print('use w2v to document presentation')
    now = datetime.datetime.now()
    print now.strftime('%Y-%m-%d %H:%M:%S')
    #df_features['z_document_dis'] = df_features.apply(lambda x: getDiff_averge(x['question1'], x['question2']), axis = 1)
    print('get_w2v')
    now = datetime.datetime.now()
    print now.strftime('%Y-%m-%d %H:%M:%S')
    df_features['q1_unique'] = df_features.apply(lambda x: getdiffwords(x['question1'], x['question2']), axis = 1)
    df_features['q2_unique'] = df_features.apply(lambda x: getdiffwords(x['question2'], x['question1']), axis = 1)
    df_features['q1_unique_w2v_weight'] = df_features.q1_unique.map(lambda x: get_vector(" ".join(x)))
    df_features['q2_unique_w2v_weight'] = df_features.q2_unique.map(lambda x: get_vector(" ".join(x)))
    df_features['q1_unique_w2v'] = df_features.q1_unique.map(lambda x: get_weight_vector(" ".join(x)))
    df_features['q2_unique_w2v'] = df_features.q2_unique.map(lambda x: get_weight_vector(" ".join(x)))
    print('z_dist')
    now = datetime.datetime.now()
    print now.strftime('%Y-%m-%d %H:%M:%S')
    #df_features['z_dist'] = df_features.apply(lambda x: Levenshtein.ratio(x['question1'], x['question2']), axis=1)
    now = datetime.datetime.now()
    print('z_tfidf_cos_sim')
    print now.strftime('%Y-%m-%d %H:%M:%S')
    #df_features['z_tfidf_cos_sim'] = df_features.apply(lambda x: cos_sim(x['question1'], x['question2']), axis=1)
    now = datetime.datetime.now()
    print('z_w2v_calc')
    print now.strftime('%Y-%m-%d %H:%M:%S')
    #df_features['z_w2v_unique'] = df_features.apply(lambda x: w2v_cos_sim(x['q1_unique'], x['q2_unique']), axis=1)
    df_features['z_w2v_unique_dis_e_weight'] = df_features.apply(lambda x: spatial.distance.euclidean(x['q1_unique_w2v_weight'], x['q2_unique_w2v_weight']), axis=1)
    df_features['z_w2v_unique_dis_e'] = df_features.apply(lambda x: spatial.distance.euclidean(x['q1_unique_w2v'], x['q2_unique_w2v']), axis=1)
    df_features['z_w2v_unique_dis_mink_w'] = df_features.apply(lambda x: spatial.distance.minkowski(x['q1_unique_w2v_weight'], x['q2_unique_w2v_weight'], 3), axis=1)
    df_features['z_w2v_unique_dis_cityblock_w'] = df_features.apply(lambda x: spatial.distance.cityblock(x['q1_unique_w2v_weight'], x['q2_unique_w2v_weight']), axis=1)
    df_features['z_w2v_unique_dis_canberra_w'] = df_features.apply(lambda x: spatial.distance.canberra(x['q1_unique_w2v_weight'], x['q2_unique_w2v_weight']), axis=1)
    df_features['z_w2v_unique_dis_mink'] = df_features.apply(lambda x: spatial.distance.minkowski(x['q1_unique_w2v'], x['q2_unique_w2v'], 3), axis=1)
    df_features['z_w2v_unique_dis_cityblock'] = df_features.apply(lambda x: spatial.distance.cityblock(x['q1_unique_w2v'], x['q2_unique_w2v']), axis=1)
    df_features['z_w2v_unique_dis_canberra'] = df_features.apply(lambda x: spatial.distance.canberra(x['q1_unique_w2v'], x['q2_unique_w2v']), axis=1)
    df_features['z_q1_unique_skew_w'] = df_features.q1_unique_w2v_weight.map(lambda x: skew(x))
    df_features['z_q2_unique_skew_w'] = df_features.q2_unique_w2v_weight.map(lambda x: skew(x))
    df_features['z_q1_unique_kur_w'] = df_features.q1_unique_w2v_weight.map(lambda x: kurtosis(x))
    df_features['z_q2_unique_kur_w'] = df_features.q2_unique_w2v_weight.map(lambda x: kurtosis(x))
    df_features['z_q1_unique_skew'] = df_features.q1_unique_w2v.map(lambda x: skew(x))
    df_features['z_q2_unique_skew'] = df_features.q2_unique_w2v.map(lambda x: skew(x))
    df_features['z_q1_unique_kur'] = df_features.q1_unique_w2v.map(lambda x: kurtosis(x))
    df_features['z_q2_unique_kur'] = df_features.q2_unique_w2v.map(lambda x: kurtosis(x))
    del df_features['q1_unique_w2v_weight']
    del df_features['q2_unique_w2v_weight']
    del df_features['q1_unique_w2v']
    del df_features['q2_unique_w2v']
    print('all done')
    print now.strftime('%Y-%m-%d %H:%M:%S')
    # fillna() returns a new frame, so assign the result (the original call discarded it)
    df_features = df_features.fillna(0.0)
    return df_features
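The per-row skew/kurtosis columns above use Series.map over cells that each hold a whole vector; a tiny sketch of that pattern with placeholder vectors:

import numpy as np
import pandas as pd
from scipy.stats import skew, kurtosis

df = pd.DataFrame({'q1_unique_w2v': [np.random.rand(300) for _ in range(5)]})
df['z_q1_unique_skew'] = df.q1_unique_w2v.map(lambda x: skew(x))
df['z_q1_unique_kur'] = df.q1_unique_w2v.map(lambda x: kurtosis(x))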
def _detect_artifacts(ica, raw, start_find, stop_find, ecg_ch, ecg_score_func,
                      ecg_criterion, eog_ch, eog_score_func, eog_criterion,
                      skew_criterion, kurt_criterion, var_criterion,
                      add_nodes):
    """Aux Function"""
    from scipy import stats

    nodes = []
    if ecg_ch is not None:
        nodes += [_ica_node('ECG', ecg_ch, ecg_score_func, ecg_criterion)]

    if eog_ch not in [None, []]:
        if not isinstance(eog_ch, list):
            eog_ch = [eog_ch]
        for idx, ch in enumerate(eog_ch):
            nodes += [_ica_node('EOG %02d' % idx, ch, eog_score_func,
                                eog_criterion)]

    if skew_criterion is not None:
        nodes += [_ica_node('skewness', None, stats.skew, skew_criterion)]

    if kurt_criterion is not None:
        nodes += [_ica_node('kurtosis', None, stats.kurtosis, kurt_criterion)]

    if var_criterion is not None:
        nodes += [_ica_node('variance', None, np.var, var_criterion)]

    if add_nodes is not None:
        nodes.extend(add_nodes)

    for node in nodes:
        scores = ica.score_sources(raw, start=start_find, stop=stop_find,
                                   target=node.target,
                                   score_func=node.score_func)
        if isinstance(node.criterion, float):
            found = list(np.where(np.abs(scores) > node.criterion)[0])
        else:
            found = list(np.atleast_1d(abs(scores).argsort()[node.criterion]))

        case = (len(found), 's' if len(found) > 1 else '', node.name)
        logger.info(' found %s artifact%s by %s' % case)
        ica.exclude += found

    logger.info('Artifact indices found:\n ' + str(ica.exclude).strip('[]'))
    if len(set(ica.exclude)) != len(ica.exclude):
        logger.info(' Removing duplicate indices...')
        ica.exclude = list(set(ica.exclude))

    logger.info('Ready.')
Source file: default_predictor.py (project: Default-Credit-Card-Prediction, author: AlexPnt)
def get_feature_stats(self):
    # get input feature
    feature_input = self.feature_input.currentText()
    try:
        if feature_input[0] == 'X':
            try:
                feature_index = int("".join(feature_input[1:]))
                feature_index -= 1
            except:
                QtWidgets.QMessageBox.information(self, "Wrong Format", "Please enter a feature name in the format: X%d.")
                return
        elif "".join(feature_input[0]+feature_input[1]) == 'LD' or "".join(feature_input[0]+feature_input[1]) == 'PC':
            try:
                feature_index = int("".join(feature_input[2:]))
                feature_index -= 1
            except:
                QtWidgets.QMessageBox.information(self, "Wrong Format", "Please enter a feature name in the format: X||LD||PC%d.")
                return
        else:
            QtWidgets.QMessageBox.information(self, "Wrong Format", "Feature names must be in the format: X%d.")
            return
    except:
        QtWidgets.QMessageBox.information(self, "Data Not Found", "Please load a dataset first.")
        return

    try:
        max_value = self.X[:, feature_index].max()
        min_value = self.X[:, feature_index].min()
        mean_value = self.X[:, feature_index].mean()
        std_value = self.X[:, feature_index].std()
        var_value = self.X[:, feature_index].var()
        skewness = stats.skew(self.X[:, feature_index])
        kurtosis = stats.kurtosis(self.X[:, feature_index], fisher=True)
        chi2, chi_p_val = chi2_feature_test(self.X, self.y, int(feature_index))
        H_kw, kw_p_val = kw_feature_test(self.X, self.y, int(feature_index))
        info_gain = information_gain(self.X, self.y, int(feature_index))
        gain_rt = gain_ratio(self.X, self.y, int(feature_index))
    except:
        QtWidgets.QMessageBox.information(self, "Wrong Index", "Feature Index Out Of Bounds.")
        return

    feature_stats = """Statistics:\n\nMinimum Value: """ + str(min_value)\
        + """\n\nMaximum Value: """ + str(max_value)\
        + """\n\nMean: """ + str(mean_value)\
        + """\n\nStandard Deviation: """ + str(std_value)\
        + """\n\nVariance: """ + str(var_value)\
        + """\n\nSkewness: """ + str(skewness)\
        + """\n\nKurtosis: """ + str(kurtosis)\
        + """\n\nChi Squared Test: """ + str(chi2[0])\
        + """\n\nKruskal-Wallis Test: """ + str(H_kw)\
        + """\n\nInformation Gain: """ + str(info_gain)\
        + """\n\nGain Ratio: """ + str(gain_rt)

    self.feature_stats.setText(feature_stats)
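The statistics panel above boils down to column-wise calls into scipy.stats; a minimal sketch of the same skew/kurtosis computation on one column of a random matrix (the data and column index are placeholders):

import numpy as np
from scipy import stats

X = np.random.rand(200, 5)
j = 2
print("skew:", stats.skew(X[:, j]), "kurtosis:", stats.kurtosis(X[:, j], fisher=True))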