def calculate_aggregate(values):
    agg_measures = {
        'avg': np.mean(values),
        'std': np.std(values),
        'var': np.var(values),
        'med': np.median(values),
        '10p': np.percentile(values, 10),
        '25p': np.percentile(values, 25),
        '50p': np.percentile(values, 50),
        '75p': np.percentile(values, 75),
        '90p': np.percentile(values, 90),
        'iqr': np.percentile(values, 75) - np.percentile(values, 25),
        'iqm': interquartile_range_mean(values),
        'mad': mean_absolute_deviation(values),
        'cov': 1.0 * np.mean(values) / np.std(values),  # note: this is mean/std, the inverse of the usual coefficient of variation (std/mean)
        'gin': gini_coefficient(values),
        'skw': stats.skew(values),
        'kur': stats.kurtosis(values),
        'sum': np.sum(values)
    }
    return agg_measures
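The snippet above relies on three helpers that are not shown on this page (interquartile_range_mean, mean_absolute_deviation, gini_coefficient). A minimal sketch of plausible implementations, assuming 1-D numeric input; these are illustrations, not the project's actual code:

import numpy as np

def interquartile_range_mean(values):
    # mean of the values lying between the 25th and 75th percentiles
    q1, q3 = np.percentile(values, [25, 75])
    values = np.asarray(values, dtype=float)
    return values[(values >= q1) & (values <= q3)].mean()

def mean_absolute_deviation(values):
    # average absolute distance from the mean
    values = np.asarray(values, dtype=float)
    return np.abs(values - values.mean()).mean()

def gini_coefficient(values):
    # Gini coefficient via the sorted-index formula; assumes non-negative values
    v = np.sort(np.asarray(values, dtype=float))
    n = v.size
    index = np.arange(1, n + 1)
    return (2.0 * np.sum(index * v) - (n + 1) * np.sum(v)) / (n * np.sum(v))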
Python kurtosis() examples (source code)
Source file: calculate_aggregate_statistics.py
Project: tbp-next-basket
Author: GiulioRossetti
def calculate_aggregate(values):
    agg_measures = {
        'avg': np.mean(values),
        'std': np.std(values),
        'var': np.var(values),
        'med': np.median(values),
        '10p': np.percentile(values, 10),
        '25p': np.percentile(values, 25),
        '50p': np.percentile(values, 50),
        '75p': np.percentile(values, 75),
        '90p': np.percentile(values, 90),
        'iqr': np.percentile(values, 75) - np.percentile(values, 25),
        'iqm': interquartile_range_mean(values),
        'mad': mean_absolute_deviation(values),
        'cov': 1.0 * np.mean(values) / np.std(values),
        'gin': gini_coefficient(values),
        'skw': stats.skew(values),
        'kur': stats.kurtosis(values)
    }
    return agg_measures
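A quick usage sketch, assuming numpy and scipy.stats are imported as in the project file and the helper functions sketched earlier are in scope:

import numpy as np
from scipy import stats

values = np.array([1.0, 2.0, 2.5, 3.0, 4.5, 5.0, 8.0])
agg = calculate_aggregate(values)
print(agg['avg'], agg['med'], agg['kur'])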
Source file: test_analytics.py
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
Author: SignalMedia
def test_kurt(self):
    tm._skip_if_no_scipy()
    from scipy.stats import kurtosis
    alt = lambda x: kurtosis(x, bias=False)
    self._check_stat_op('kurt', alt)

    index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]],
                       labels=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2],
                               [0, 1, 0, 1, 0, 1]])
    s = Series(np.random.randn(6), index=index)
    self.assertAlmostEqual(s.kurt(), s.kurt(level=0)['bar'])

    # test corner cases, kurt() returns NaN unless there's at least 4
    # values
    min_N = 4
    for i in range(1, min_N + 1):
        s = Series(np.ones(i))
        df = DataFrame(np.ones((i, i)))
        if i < min_N:
            self.assertTrue(np.isnan(s.kurt()))
            self.assertTrue(np.isnan(df.kurt()).all())
        else:
            self.assertEqual(0, s.kurt())
            self.assertTrue((df.kurt() == 0).all())
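This test pins pandas' Series.kurt() to SciPy's bias-corrected excess kurtosis. The same relationship as a standalone check, assuming recent pandas and SciPy:

import numpy as np
import pandas as pd
from scipy.stats import kurtosis

x = np.random.default_rng(0).normal(size=1000)
# pandas uses the adjusted (bias-corrected) Fisher kurtosis, matching bias=False
assert np.isclose(pd.Series(x).kurt(), kurtosis(x, bias=False))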
Source file: test_analytics.py
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
Author: SignalMedia
def test_kurt(self):
    tm._skip_if_no_scipy()
    from scipy.stats import kurtosis

    def alt(x):
        if len(x) < 4:
            return np.nan
        return kurtosis(x, bias=False)

    self._check_stat_op('kurt', alt)

    index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]],
                       labels=[[0, 0, 0, 0, 0, 0],
                               [0, 1, 2, 0, 1, 2],
                               [0, 1, 0, 1, 0, 1]])
    df = DataFrame(np.random.randn(6, 3), index=index)
    kurt = df.kurt()
    kurt2 = df.kurt(level=0).xs('bar')
    assert_series_equal(kurt, kurt2, check_names=False)
    self.assertTrue(kurt.name is None)
    self.assertEqual(kurt2.name, 'bar')
def plot_hist(item, figure_id=1):
    pt.figure(figure_id)
    kurtosis = -np.ones(8)
    for i in range(item.shape[1]):
        pt.subplot(241 + i)  # subplot numbering is 1-based; the original 240+i fails for i=0
        tmp = item[item[:, i] != -1, i]
        tmp = tmp + np.random.rand(len(tmp)) - 0.5  # jitter the discrete scores
        pt.hist(tmp, bins=6, density=True, range=(0.9, 6.1), alpha=0.8, color=colorc[i])  # 'normed' was removed in matplotlib 3.x
        pt.title(name[i])
        density = kde.gaussian_kde(tmp)
        xgrid = np.linspace(0, 6, 100)
        pt.plot(xgrid, density(xgrid), 'r-')
        avg = np.mean(tmp)
        sd = np.std(tmp)
        pt.plot(xgrid, normpdf(xgrid, avg, sd))  # normpdf, colorc and name come from elsewhere in the source file
        kurtosis[i] = sps.kurtosis(item[item[:, i] != -1, i])  # moved inside the loop so every column is measured
    pt.show()
    return kurtosis
def _statistics(self):
    data = self.tr.data
    t = np.arange(0, self.delta * self.npts, self.delta)
    m = len(data)
    Nsta = int(self.t_win * self.sampling_rate)
    # compute the short time average (STA)
    kt = np.zeros(m, dtype='float64')
    pad_kt = np.zeros(Nsta)
    # Tricky: Construct a big window of length len(a)-nsta. Now move this
    # window nsta points, i.e. the window "sees" every point in a at least
    # once.
    # Changed xrange to range as it is compatible in both python 2 & 3
    for i in range(m):  # window size to smooth over
        # for i < Nsta the slice wraps around; those entries are zeroed out below
        kt[i] = abs(kurtosis(data[i-Nsta:i]))
    kt[0:Nsta] = 0
    return kt
def mfccPostProcess(directory, fileCount):
    for count in range(fileCount):
        print("{0}/{1}".format(count + 1, fileCount))
        for mfccext in mfccList:
            mfcc = np.loadtxt(directory + str(count) + mfccext + ".csv", delimiter=",")
            dmfcc = librosa.feature.delta(mfcc)
            # columns 0-6: statistics of the MFCCs, columns 7-13: of their deltas
            result = np.zeros((mfcc.shape[1], 14))
            result[:, 0] = np.mean(mfcc, axis=0)
            result[:, 1] = np.var(mfcc, axis=0, dtype=np.float64)
            result[:, 2] = stats.skew(mfcc, axis=0)
            result[:, 3] = stats.kurtosis(mfcc, axis=0, fisher=False)
            result[:, 4] = np.median(mfcc, axis=0)
            result[:, 5] = np.min(mfcc, axis=0)
            result[:, 6] = np.max(mfcc, axis=0)
            result[:, 7] = np.mean(dmfcc, axis=0)
            result[:, 8] = np.var(dmfcc, axis=0, dtype=np.float64)
            result[:, 9] = stats.skew(dmfcc, axis=0)
            result[:, 10] = stats.kurtosis(dmfcc, axis=0, fisher=False)
            result[:, 11] = np.median(dmfcc, axis=0)
            result[:, 12] = np.min(dmfcc, axis=0)
            result[:, 13] = np.max(dmfcc, axis=0)
            result[np.where(np.isnan(result))] = 0
            np.savetxt(directory + str(count) + mfccext + "_stat.txt", result.flatten("F"), delimiter=",")
def feat_eeg(signals):
    """
    calculate the relative power as defined by Leangkvist (2012),
    assuming the signal is recorded at 100 Hz
    """
    if signals.ndim == 1:
        signals = np.expand_dims(signals, 0)
    sfreq = use_sfreq
    nsamp = float(signals.shape[1])
    feats = np.zeros((signals.shape[0], 9), dtype='float32')
    # 5 features for the frequency bands
    w = (fft(signals, axis=1)).real
    delta = np.sum(np.abs(w[:, np.arange(0.5*nsamp/sfreq, 4*nsamp/sfreq, dtype=int)]), axis=1)
    theta = np.sum(np.abs(w[:, np.arange(4*nsamp/sfreq, 8*nsamp/sfreq, dtype=int)]), axis=1)
    alpha = np.sum(np.abs(w[:, np.arange(8*nsamp/sfreq, 13*nsamp/sfreq, dtype=int)]), axis=1)
    beta = np.sum(np.abs(w[:, np.arange(13*nsamp/sfreq, 20*nsamp/sfreq, dtype=int)]), axis=1)
    gamma = np.sum(np.abs(w[:, np.arange(20*nsamp/sfreq, 50*nsamp/sfreq, dtype=int)]), axis=1)  # only until 50, because hz=100
    spindle = np.sum(np.abs(w[:, np.arange(12*nsamp/sfreq, 14*nsamp/sfreq, dtype=int)]), axis=1)
    sum_abs_pow = delta + theta + alpha + beta + gamma + spindle
    feats[:, 0] = delta / sum_abs_pow
    feats[:, 1] = theta / sum_abs_pow
    feats[:, 2] = alpha / sum_abs_pow
    feats[:, 3] = beta / sum_abs_pow
    feats[:, 4] = gamma / sum_abs_pow
    feats[:, 5] = spindle / sum_abs_pow
    feats[:, 6] = np.log10(stats.kurtosis(signals, fisher=False, axis=1))  # kurtosis
    feats[:, 7] = np.log10(-np.sum([(x/nsamp)*(np.log(x/nsamp)) for x in np.apply_along_axis(lambda x: np.histogram(x, bins=8)[0], 1, signals)], axis=1))  # histogram entropy
    #feats[:,7] = np.polynomial.polynomial.polyfit(np.log(f[np.arange(0.5*nsamp/sfreq,50*nsamp/sfreq, dtype=int)]), np.log(w[0,np.arange(0.5*nsamp/sfreq,50*nsamp/sfreq, dtype=int)]),1)
    feats[:, 8] = np.dot(np.array([3.5, 4, 5, 7, 30]), feats[:, 0:5].T) / (sfreq/2 - 0.5)  # spectral mean
    if np.any(np.isnan(feats)):  # fixed: `feats == np.nan` is always False
        print('NaN detected')
    return np.nan_to_num(feats)
def feat_wavelet(signals):
    """
    calculate the relative power as defined by Leangkvist (2012),
    assuming the signal is recorded at 100 Hz
    """
    if signals.ndim == 1:
        signals = np.expand_dims(signals, 0)
    sfreq = use_sfreq
    nsamp = float(signals.shape[1])
    feats = np.zeros((signals.shape[0], 8), dtype='float32')
    # 5 features for the frequency bands
    w = (fft(signals, axis=1)).real
    delta = np.sum(np.abs(w[:, np.arange(0.5*nsamp/sfreq, 4*nsamp/sfreq, dtype=int)]), axis=1)
    theta = np.sum(np.abs(w[:, np.arange(4*nsamp/sfreq, 8*nsamp/sfreq, dtype=int)]), axis=1)
    alpha = np.sum(np.abs(w[:, np.arange(8*nsamp/sfreq, 13*nsamp/sfreq, dtype=int)]), axis=1)
    beta = np.sum(np.abs(w[:, np.arange(13*nsamp/sfreq, 20*nsamp/sfreq, dtype=int)]), axis=1)
    gamma = np.sum(np.abs(w[:, np.arange(20*nsamp/sfreq, 50*nsamp/sfreq, dtype=int)]), axis=1)  # only until 50, because hz=100
    sum_abs_pow = delta + theta + alpha + beta + gamma
    feats[:, 0] = delta / sum_abs_pow
    feats[:, 1] = theta / sum_abs_pow
    feats[:, 2] = alpha / sum_abs_pow
    feats[:, 3] = beta / sum_abs_pow
    feats[:, 4] = gamma / sum_abs_pow
    feats[:, 5] = np.log10(stats.kurtosis(signals, fisher=False, axis=1))  # kurtosis
    feats[:, 6] = np.log10(-np.sum([(x/nsamp)*(np.log(x/nsamp)) for x in np.apply_along_axis(lambda x: np.histogram(x, bins=8)[0], 1, signals)], axis=1))  # histogram entropy
    #feats[:,7] = np.polynomial.polynomial.polyfit(np.log(f[np.arange(0.5*nsamp/sfreq,50*nsamp/sfreq, dtype=int)]), np.log(w[0,np.arange(0.5*nsamp/sfreq,50*nsamp/sfreq, dtype=int)]),1)
    feats[:, 7] = np.dot(np.array([3.5, 4, 5, 7, 30]), feats[:, 0:5].T) / (sfreq/2 - 0.5)  # spectral mean
    if np.any(np.isnan(feats)):  # fixed: `feats == np.nan` is always False
        print('NaN detected')
    return np.nan_to_num(feats)
def feat_eog(signals):
    """
    calculate the EOG features
    :param signals: 1D or 2D signals
    """
    if signals.ndim == 1:
        signals = np.expand_dims(signals, 0)
    sfreq = use_sfreq
    nsamp = float(signals.shape[1])
    w = (fft(signals, axis=1)).real
    feats = np.zeros((signals.shape[0], 15), dtype='float32')
    delta = np.sum(np.abs(w[:, np.arange(0.5*nsamp/sfreq, 4*nsamp/sfreq, dtype=int)]), axis=1)
    theta = np.sum(np.abs(w[:, np.arange(4*nsamp/sfreq, 8*nsamp/sfreq, dtype=int)]), axis=1)
    alpha = np.sum(np.abs(w[:, np.arange(8*nsamp/sfreq, 13*nsamp/sfreq, dtype=int)]), axis=1)
    beta = np.sum(np.abs(w[:, np.arange(13*nsamp/sfreq, 20*nsamp/sfreq, dtype=int)]), axis=1)
    gamma = np.sum(np.abs(w[:, np.arange(20*nsamp/sfreq, 50*nsamp/sfreq, dtype=int)]), axis=1)  # only until 50, because hz=100
    sum_abs_pow = delta + theta + alpha + beta + gamma
    feats[:, 0] = delta / sum_abs_pow
    feats[:, 1] = theta / sum_abs_pow
    feats[:, 2] = alpha / sum_abs_pow
    feats[:, 3] = beta / sum_abs_pow
    feats[:, 4] = gamma / sum_abs_pow
    feats[:, 5] = np.dot(np.array([3.5, 4, 5, 7, 30]), feats[:, 0:5].T) / (sfreq/2 - 0.5)  # smean
    feats[:, 6] = np.sqrt(np.max(signals, axis=1))  # PAV
    feats[:, 7] = np.sqrt(np.abs(np.min(signals, axis=1)))  # VAV
    feats[:, 8] = np.argmax(signals, axis=1)/nsamp  # PAP
    feats[:, 9] = np.argmin(signals, axis=1)/nsamp  # VAP
    feats[:, 10] = np.sqrt(np.sum(np.abs(signals), axis=1) / np.mean(np.sum(np.abs(signals), axis=1)))  # AUC
    feats[:, 11] = np.sum(((np.roll(np.sign(signals), 1, axis=1) - np.sign(signals)) != 0).astype(int), axis=1)/nsamp  # TVC
    feats[:, 12] = np.log10(np.std(signals, axis=1))  # STD/VAR
    feats[:, 13] = np.log10(stats.kurtosis(signals, fisher=False, axis=1))  # kurtosis
    feats[:, 14] = np.log10(-np.sum([(x/nsamp)*((np.log((x+np.spacing(1))/nsamp))) for x in np.apply_along_axis(lambda x: np.histogram(x, bins=8)[0], 1, signals)], axis=1))  # histogram entropy
    if np.any(np.isnan(feats)):  # fixed: `feats == np.nan` is always False
        print('NaN detected')
    return np.nan_to_num(feats)
def feat_emg(signals):
    """
    calculate the EMG median as defined by Leangkvist (2012)
    """
    if signals.ndim == 1:
        signals = np.expand_dims(signals, 0)
    sfreq = use_sfreq
    nsamp = float(signals.shape[1])
    w = (fft(signals, axis=1)).real
    feats = np.zeros((signals.shape[0], 13), dtype='float32')
    delta = np.sum(np.abs(w[:, np.arange(0.5*nsamp/sfreq, 4*nsamp/sfreq, dtype=int)]), axis=1)
    theta = np.sum(np.abs(w[:, np.arange(4*nsamp/sfreq, 8*nsamp/sfreq, dtype=int)]), axis=1)
    alpha = np.sum(np.abs(w[:, np.arange(8*nsamp/sfreq, 13*nsamp/sfreq, dtype=int)]), axis=1)
    beta = np.sum(np.abs(w[:, np.arange(13*nsamp/sfreq, 20*nsamp/sfreq, dtype=int)]), axis=1)
    gamma = np.sum(np.abs(w[:, np.arange(20*nsamp/sfreq, 50*nsamp/sfreq, dtype=int)]), axis=1)  # only until 50, because hz=100
    sum_abs_pow = delta + theta + alpha + beta + gamma
    feats[:, 0] = delta / sum_abs_pow
    feats[:, 1] = theta / sum_abs_pow
    feats[:, 2] = alpha / sum_abs_pow
    feats[:, 3] = beta / sum_abs_pow
    feats[:, 4] = gamma / sum_abs_pow
    feats[:, 5] = np.dot(np.array([3.5, 4, 5, 7, 30]), feats[:, 0:5].T) / (sfreq/2 - 0.5)  # smean
    emg = np.sum(np.abs(w[:, np.arange(12.5*nsamp/sfreq, 32*nsamp/sfreq, dtype=int)]), axis=1)
    feats[:, 6] = emg / np.sum(np.abs(w[:, np.arange(8*nsamp/sfreq, 32*nsamp/sfreq, dtype=int)]), axis=1)  # ratio of high freq to total motor
    feats[:, 7] = np.median(np.abs(w[:, np.arange(8*nsamp/sfreq, 32*nsamp/sfreq, dtype=int)]), axis=1)  # median freq
    feats[:, 8] = np.mean(np.abs(w[:, np.arange(8*nsamp/sfreq, 32*nsamp/sfreq, dtype=int)]), axis=1)  # mean freq
    feats[:, 9] = np.std(signals, axis=1)  # std
    feats[:, 10] = np.mean(signals, axis=1)
    feats[:, 11] = np.log10(stats.kurtosis(signals, fisher=False, axis=1))
    feats[:, 12] = np.log10(-np.sum([(x/nsamp)*((np.log((x+np.spacing(1))/nsamp))) for x in np.apply_along_axis(lambda x: np.histogram(x, bins=8)[0], 1, signals)], axis=1))  # histogram entropy
    if np.any(np.isnan(feats)):  # fixed: `feats == np.nan` is always False
        print('NaN detected')
    return np.nan_to_num(feats)
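The four feat_* functions above share the same relative band-power core. A minimal standalone sketch of that computation, assuming a 100 Hz sampling rate and using rfft so each bin maps directly to a non-negative frequency:

import numpy as np

def relative_band_powers(signal, sfreq=100.0,
                         bands=((0.5, 4), (4, 8), (8, 13), (13, 20), (20, 50))):
    # magnitude spectrum of a 1-D signal
    spectrum = np.abs(np.fft.rfft(signal))
    freqs = np.fft.rfftfreq(len(signal), d=1.0 / sfreq)
    # sum of magnitudes inside each band, normalized to the total
    powers = np.array([spectrum[(freqs >= lo) & (freqs < hi)].sum()
                       for lo, hi in bands])
    return powers / powers.sum()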
def __init__(self, s, lags=0, kurtosis='adapt',
             learningRate=1.5, tolerance=1.0e-6, maxIter=10000,
             callback=None, verbose=False, *args, **kwargs):
    STrans.__init__(self, s, lags=lags, *args, **kwargs)
    self.train(s, kurtosis=kurtosis,
               learningRate=learningRate,
               tolerance=tolerance, maxIter=maxIter,
               callback=callback, verbose=verbose)
def demoICA():
    t = np.linspace(0.0, 30*np.pi, 1000)
    s1 = spsig.sawtooth(t)
    s2 = np.cos(5.0*t)
    s3 = np.random.uniform(-1.0, 1.0, size=t.size)
    s = np.vstack((s1, s2, s3)).T
    m = np.random.random((3, 3))
    m /= m.sum(axis=0)
    sMixed = s.dot(m)
    icaFilt = ICA(sMixed, kurtosis='sub', verbose=True)
    fig = plt.figure()
    axOrig = fig.add_subplot(4, 1, 1)
    axOrig.plot(s+util.colsep(s))
    axOrig.set_title('Unmixed Signal')
    axOrig.autoscale(tight=True)
    axMixed = fig.add_subplot(4, 1, 2)
    axMixed.plot(sMixed+util.colsep(sMixed))
    axMixed.set_title('Mixed Signal (random transform)')
    axMixed.autoscale(tight=True)
    axUnmixed = fig.add_subplot(4, 1, 3)
    icaFilt.plotTransform(sMixed, ax=axUnmixed)
    axUnmixed.set_title('ICA Components')
    axUnmixed.autoscale(tight=True)
    axCleaned = fig.add_subplot(4, 1, 4)
    icaFilt.plotFilter(sMixed, comp=(0, 1,), ax=axCleaned)
    axCleaned.set_title('Cleaned Signal (First two components kept)')
    axCleaned.autoscale(tight=True)
    fig.tight_layout()
Source file: test_window.py
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
Author: SignalMedia
def test_rolling_kurt(self):
    try:
        from scipy.stats import kurtosis
    except ImportError:
        raise nose.SkipTest('no scipy')
    self._check_moment_func(mom.rolling_kurt,
                            lambda x: kurtosis(x, bias=False), name='kurt')
Source file: test_nanops.py
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
Author: SignalMedia
def test_nankurt(self):
    tm.skip_if_no_package('scipy.stats')
    tm._skip_if_scipy_0_17()
    from scipy.stats import kurtosis
    func1 = partial(kurtosis, fisher=True)
    func = partial(self._skew_kurt_wrap, func=func1)
    self.check_funs(nanops.nankurt, func, allow_complex=False,
                    allow_str=False, allow_date=False, allow_tdelta=False)
Source file: test_nanops.py
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
Author: SignalMedia
def setUp(self):
    # Test data + kurtosis value (computed with scipy.stats.kurtosis)
    self.samples = np.sin(np.linspace(0, 1, 200))
    self.actual_kurt = -1.2058303433799713
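A quick standalone check of that fixture value, assuming SciPy's default Fisher (excess) kurtosis:

import numpy as np
from scipy.stats import kurtosis

samples = np.sin(np.linspace(0, 1, 200))
print(kurtosis(samples))  # approximately -1.2058303433799713, the actual_kurt above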
def statistical_metrics(x):
    """
    Calculates statistical metrics on input array (mean, std, skew, kurtosis).
    """
    metrics = {
        'mean': np.mean,
        'stdev': np.std,
        'skew': stats.skew,
        'kurtosis': stats.kurtosis
    }
    return {k: fn(x.flatten()) for k, fn in metrics.items()}
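A usage sketch on hypothetical data, assuming the function above with numpy and scipy.stats in scope:

import numpy as np
from scipy import stats

x = np.random.default_rng(7).normal(size=(16, 16))
print(statistical_metrics(x))  # {'mean': ..., 'stdev': ..., 'skew': ..., 'kurtosis': ...}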
def test_gen_usr_distrib(n_samples=100000, verbose=False):
    rng = np.random.RandomState(0)
    xs = _gen_usr_distrib(n_samples, ['laplace'], rng)
    assert_allclose(np.mean(xs), 0, atol=5e-2)
    assert_allclose(np.std(xs), 1, atol=5e-2)
    assert_allclose(skew(xs)[0], 0, atol=5e-2)
    assert_allclose(kurtosis(xs)[0], 3, atol=5e-2)  # excess kurtosis of a Laplace distribution is 3
    xs = _gen_usr_distrib(n_samples, ['exp'], rng)
    assert_allclose(np.std(xs), 1, atol=5e-2)
def get_features(df_features):
    print('use w2v to document presentation')
    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    df_features['z_document_dis'] = df_features.apply(lambda x: getDiff_averge_tfidf(x['question1'], x['question2']), axis=1)
    print('nones')
    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    df_features['q1_unique'] = df_features.apply(lambda x: getdiffwords(x['question1'], x['question2']), axis=1)
    df_features['q2_unique'] = df_features.apply(lambda x: getdiffwords(x['question2'], x['question1']), axis=1)
    #df_features['question1_nouns'] = df_features.question1.map(lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(str(x).lower())) if t[:1] in ['N']])
    #df_features['question2_nouns'] = df_features.question2.map(lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(str(x).lower())) if t[:1] in ['N']])
    df_features['question1_w2v'] = df_features.question1.map(lambda x: get_vector_tfidf(" ".join(x)))
    df_features['question2_w2v'] = df_features.question2.map(lambda x: get_vector_tfidf(" ".join(x)))
    print('z_dist')
    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    df_features['z_dist'] = df_features.apply(lambda x: Levenshtein.ratio(x['question1'], x['question2']), axis=1)
    now = datetime.datetime.now()
    print('z_tfidf_cos_sim')
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    df_features['z_tfidf_cos_sim'] = df_features.apply(lambda x: cos_sim(x['question1'], x['question2']), axis=1)
    now = datetime.datetime.now()
    print('z_w2v_nones')
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    df_features['z_w2v_unique'] = df_features.apply(lambda x: w2v_cos_sim_tfidf(x['q1_unique'], x['q2_unique']), axis=1)
    df_features['z_w2v_dis_e'] = df_features.apply(lambda x: spatial.distance.euclidean(x['question1_w2v'], x['question2_w2v']), axis=1)
    df_features['z_w2v_dis_mink'] = df_features.apply(lambda x: spatial.distance.minkowski(x['question1_w2v'], x['question2_w2v'], 3), axis=1)
    df_features['z_w2v_dis_cityblock'] = df_features.apply(lambda x: spatial.distance.cityblock(x['question1_w2v'], x['question2_w2v']), axis=1)
    df_features['z_w2v_dis_canberra'] = df_features.apply(lambda x: spatial.distance.canberra(x['question1_w2v'], x['question2_w2v']), axis=1)
    df_features['z_q1_skew'] = df_features.question1_w2v.map(lambda x: skew(x))
    df_features['z_q2_skew'] = df_features.question2_w2v.map(lambda x: skew(x))
    df_features['z_q1_kur'] = df_features.question1_w2v.map(lambda x: kurtosis(x))
    df_features['z_q2_kur'] = df_features.question2_w2v.map(lambda x: kurtosis(x))
    del df_features['question1_w2v']
    del df_features['question2_w2v']
    print('all done')
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    df_features = df_features.fillna(0.0)  # fillna returns a copy; assign it back
    return df_features
def _get_grid_size(data, use_default_square=False):
    """
    Calculate the size of the grid.

    Parameters
    ----------
    data : array-like
        The normalized data.
    use_default_square : bool
        Define the grid as the minimal possible square.

    Returns
    -------
    int, int
        The width and height of the grid.
    """
    # if the grid were square, this would be the minimum size
    sqr_size = int(np.ceil(np.sqrt(len(data))))
    size_x = size_y = sqr_size
    if not use_default_square:
        kurt = kurtosis(data)
        kurt_x, kurt_y = np.int32(np.abs(np.ceil(kurt * 2)))
        size_x += kurt_x
        size_y += kurt_y
    return size_x, size_y
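A usage sketch with hypothetical two-column data, assuming kurtosis here is scipy.stats.kurtosis (so it returns one value per column):

import numpy as np

data = np.random.default_rng(1).normal(size=(500, 2))
print(_get_grid_size(data))                            # square stretched by per-column kurtosis
print(_get_grid_size(data, use_default_square=True))   # minimal square: (23, 23)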
def features(self, q1, q2):
    q1 = str(q1).lower().split()
    q2 = str(q2).lower().split()
    q1 = [w for w in q1 if w not in stopwords]
    q2 = [w for w in q2 if w not in stopwords]
    wmd = min(self.model.wmdistance(q1, q2), 10)
    q1vec = self.sent2vec(q1)
    q2vec = self.sent2vec(q2)
    if q1vec is not None and q2vec is not None:
        cos = cosine(q1vec, q2vec)
        city = cityblock(q1vec, q2vec)
        jacc = jaccard(q1vec, q2vec)
        canb = canberra(q1vec, q2vec)
        eucl = euclidean(q1vec, q2vec)
        mink = minkowski(q1vec, q2vec, 3)
        bray = braycurtis(q1vec, q2vec)
        q1_skew = skew(q1vec)
        q2_skew = skew(q2vec)
        q1_kurt = kurtosis(q1vec)
        q2_kurt = kurtosis(q2vec)
    else:
        cos = city = jacc = canb = eucl = mink = bray = -1
        q1_skew = q2_skew = q1_kurt = q2_kurt = 0
    return wmd, cos, city, jacc, canb, eucl, mink, bray, q1_skew, q2_skew, q1_kurt, q2_kurt
def features(self, q1, q2):
    q1 = str(q1).lower().split()
    q2 = str(q2).lower().split()
    q1 = [w for w in q1 if w not in stopwords]
    q2 = [w for w in q2 if w not in stopwords]
    wmd = min(self.model.wmdistance(q1, q2), 10)
    wmd_norm = min(self.model_norm.wmdistance(q1, q2), 10)
    q1vec = self.sent2vec(q1)
    q2vec = self.sent2vec(q2)
    if q1vec is not None and q2vec is not None:
        cos = cosine(q1vec, q2vec)
        city = cityblock(q1vec, q2vec)
        jacc = jaccard(q1vec, q2vec)
        canb = canberra(q1vec, q2vec)
        eucl = euclidean(q1vec, q2vec)
        mink = minkowski(q1vec, q2vec, 3)
        bray = braycurtis(q1vec, q2vec)
        q1_skew = skew(q1vec)
        q2_skew = skew(q2vec)
        q1_kurt = kurtosis(q1vec)
        q2_kurt = kurtosis(q2vec)
    else:
        cos = city = jacc = canb = eucl = mink = bray = -1
        q1_skew = q2_skew = q1_kurt = q2_kurt = 0
    return wmd, wmd_norm, cos, city, jacc, canb, eucl, mink, bray, q1_skew, q2_skew, q1_kurt, q2_kurt
def lightcurve_moments(ftimes, fmags, ferrs):
    '''This calculates the weighted mean, stdev, median, MAD, percentiles, skew,
    kurtosis, fraction of LC beyond 1-stdev, and IQR.
    '''
    ndet = len(fmags)
    if ndet > 9:
        # now calculate the various things we need
        series_median = npmedian(fmags)
        series_wmean = (
            npsum(fmags*(1.0/(ferrs*ferrs)))/npsum(1.0/(ferrs*ferrs))
        )
        series_mad = npmedian(npabs(fmags - series_median))
        series_stdev = 1.483*series_mad  # 1.483*MAD is the robust stdev estimate for a normal distribution
        series_skew = spskew(fmags)
        series_kurtosis = spkurtosis(fmags)
        # get the beyond1std fraction
        series_above1std = len(fmags[fmags > (series_median + series_stdev)])
        series_below1std = len(fmags[fmags < (series_median - series_stdev)])
        # this is the fraction beyond 1 stdev
        series_beyond1std = (series_above1std + series_below1std)/float(ndet)
        # get the magnitude percentiles
        series_mag_percentiles = nppercentile(
            fmags,
            [5.0, 10, 17.5, 25, 32.5, 40, 60, 67.5, 75, 82.5, 90, 95]
        )
        return {
            'median': series_median,
            'wmean': series_wmean,
            'mad': series_mad,
            'stdev': series_stdev,
            'skew': series_skew,
            'kurtosis': series_kurtosis,
            'beyond1std': series_beyond1std,
            'mag_percentiles': series_mag_percentiles,
            'mag_iqr': series_mag_percentiles[8] - series_mag_percentiles[3],  # 75th minus 25th percentile
        }
    else:
        LOGERROR('not enough detections in this magseries '
                 'to calculate light curve moments')
        return None
def create_scipy_features(base_features, sentinel):
    r"""Calculate the skew, kurtosis, and other statistical features
    for each row.

    Parameters
    ----------
    base_features : numpy array
        The feature dataframe.
    sentinel : float
        The number to be imputed for NaN values.

    Returns
    -------
    sp_features : numpy array
        The calculated SciPy features.
    """
    logger.info("Creating SciPy Features")
    # Generate scipy features
    logger.info("SciPy Feature: geometric mean")
    row_gmean = sps.gmean(base_features, axis=1)
    logger.info("SciPy Feature: kurtosis")
    row_kurtosis = sps.kurtosis(base_features, axis=1)
    logger.info("SciPy Feature: kurtosis test")
    row_ktest, pvalue = sps.kurtosistest(base_features, axis=1)
    logger.info("SciPy Feature: normal test")
    row_normal, pvalue = sps.normaltest(base_features, axis=1)
    logger.info("SciPy Feature: skew")
    row_skew = sps.skew(base_features, axis=1)
    logger.info("SciPy Feature: skew test")
    row_stest, pvalue = sps.skewtest(base_features, axis=1)
    logger.info("SciPy Feature: variation")
    row_var = sps.variation(base_features, axis=1)
    logger.info("SciPy Feature: signal-to-noise ratio")
    row_stn = sps.signaltonoise(base_features, axis=1)  # removed in SciPy 1.0; see the note below
    logger.info("SciPy Feature: standard error of mean")
    row_sem = sps.sem(base_features, axis=1)
    sp_features = np.column_stack((row_gmean, row_kurtosis, row_ktest,
                                   row_normal, row_skew, row_stest,
                                   row_var, row_stn, row_sem))
    sp_features = impute_values(sp_features, 'float64', sentinel)
    sp_features = StandardScaler().fit_transform(sp_features)
    # Return new SciPy features
    logger.info("SciPy Feature Count : %d", sp_features.shape[1])
    return sp_features
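Note: scipy.stats.signaltonoise was deprecated in SciPy 0.16 and removed in 1.0. On modern SciPy, an equivalent drop-in matching the removed function's mean/std definition is:

import numpy as np

def signaltonoise(a, axis=0, ddof=0):
    # mean divided by standard deviation, as the removed scipy.stats.signaltonoise computed
    a = np.asanyarray(a)
    m = a.mean(axis=axis)
    sd = a.std(axis=axis, ddof=ddof)
    return np.where(sd == 0, 0, m / sd)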
#
# Function create_clusters
#
def train(self, s, kurtosis, learningRate, tolerance, maxIter, callback, verbose):
    s = self.prep(s)
    wPrev = np.empty(self.w.shape)
    grad = np.empty((self.nComp, self.nComp))
    I = np.eye(self.nComp, dtype=self.dtype)
    n = 1.0/s.shape[0]
    iteration = 0
    while True:
        y = s.dot(self.w)
        if kurtosis == 'sub':
            k = -1
        elif kurtosis == 'super':
            k = 1
        elif kurtosis == 'adapt':
            #k = np.sign(np.mean(1.0-util.fastTanh(y)**2, axis=0) *
            #            np.mean(y**2, axis=0) -
            #            np.mean(y*util.fastTanh(y), axis=0))
            k = np.sign(spstat.kurtosis(y, axis=0))
            k[np.isclose(k, 0.0)] = -1.0
        grad[...] = (I - k*util.fastTanh(y).T.dot(y) - y.T.dot(y)).T.dot(self.w) * n
        wPrev[...] = self.w
        self.w += learningRate * grad
        wtol = np.max(np.abs(wPrev-self.w))
        if verbose:
            print('%d %6f' % (iteration, wtol))
        if callback is not None:
            callback(iteration, wtol)
        if wtol < tolerance:
            self.reason = 'tolerance'
            break
        elif np.max(np.abs(self.w)) > 1.0e100:
            self.reason = 'diverge'
            break
        if iteration >= maxIter:
            self.reason = 'maxiter'
            break
        iteration += 1
    if verbose:
        print('Reason: ' + self.reason)
    self.w /= np.sqrt(np.sum(self.w**2, axis=0))
    self.wInv[...] = np.linalg.pinv(self.w)
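In the 'adapt' branch above, the sign of each output's excess kurtosis selects the sub- or super-Gaussian update rule, as in extended Infomax ICA. A standalone illustration of that sign rule:

import numpy as np
from scipy import stats as spstat

rng = np.random.default_rng(0)
y = np.column_stack((rng.uniform(-1, 1, 10000),   # sub-Gaussian: negative excess kurtosis
                     rng.laplace(size=10000)))    # super-Gaussian: positive excess kurtosis
print(np.sign(spstat.kurtosis(y, axis=0)))  # expected: [-1.  1.]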
def compute_basic_descriptives(df, selected_features):
    """
    Compute basic descriptive statistics for the columns
    in the given data frame.

    Parameters
    ----------
    df : pandas DataFrame
        Input data frame containing the feature values.
    selected_features : list of str
        List of feature names for which to compute
        the descriptives.

    Returns
    -------
    df_desc : pandas DataFrame
        Data frame containing the descriptives for
        each of the features.
    """
    # select only feature columns
    df_desc = df[selected_features]
    # get the H1 scores
    scores = df['sc1']
    # compute correlations and p-values separately for efficiency
    cor_series = df_desc.apply(lambda s: pearsonr(s, scores))
    cors = cor_series.apply(lambda t: t[0])
    pvalues = cor_series.apply(lambda t: t[1])
    # create a data frame with all the descriptives
    df_output = pd.DataFrame({'mean': df_desc.mean(),
                              'min': df_desc.min(),
                              'max': df_desc.max(),
                              'std. dev.': df_desc.std(),
                              'skewness': df_desc.skew(),
                              'kurtosis': df_desc.apply(lambda s: kurtosis(s, fisher=False)),
                              'Correlation': cors,
                              'p': pvalues,
                              'N': len(df_desc)})
    # reorder the columns to make it look better
    df_output = df_output[['mean', 'std. dev.', 'min', 'max',
                           'skewness', 'kurtosis', 'Correlation',
                           'p', 'N']]
    return df_output
def get_features(df_features):
    print('use w2v to document presentation')
    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    #df_features['z_document_dis'] = df_features.apply(lambda x: getDiff_averge(x['question1'], x['question2']), axis=1)
    print('get_w2v')
    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    df_features['q1_unique'] = df_features.apply(lambda x: getdiffwords(x['question1'], x['question2']), axis=1)
    df_features['q2_unique'] = df_features.apply(lambda x: getdiffwords(x['question2'], x['question1']), axis=1)
    df_features['q1_unique_w2v_weight'] = df_features.q1_unique.map(lambda x: get_vector(" ".join(x)))
    df_features['q2_unique_w2v_weight'] = df_features.q2_unique.map(lambda x: get_vector(" ".join(x)))
    df_features['q1_unique_w2v'] = df_features.q1_unique.map(lambda x: get_weight_vector(" ".join(x)))
    df_features['q2_unique_w2v'] = df_features.q2_unique.map(lambda x: get_weight_vector(" ".join(x)))
    print('z_dist')
    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    #df_features['z_dist'] = df_features.apply(lambda x: Levenshtein.ratio(x['question1'], x['question2']), axis=1)
    now = datetime.datetime.now()
    print('z_tfidf_cos_sim')
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    #df_features['z_tfidf_cos_sim'] = df_features.apply(lambda x: cos_sim(x['question1'], x['question2']), axis=1)
    now = datetime.datetime.now()
    print('z_w2v_calc')
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    #df_features['z_w2v_unique'] = df_features.apply(lambda x: w2v_cos_sim(x['q1_unique'], x['q2_unique']), axis=1)
    df_features['z_w2v_unique_dis_e_weight'] = df_features.apply(lambda x: spatial.distance.euclidean(x['q1_unique_w2v_weight'], x['q2_unique_w2v_weight']), axis=1)
    df_features['z_w2v_unique_dis_e'] = df_features.apply(lambda x: spatial.distance.euclidean(x['q1_unique_w2v'], x['q2_unique_w2v']), axis=1)
    df_features['z_w2v_unique_dis_mink_w'] = df_features.apply(lambda x: spatial.distance.minkowski(x['q1_unique_w2v_weight'], x['q2_unique_w2v_weight'], 3), axis=1)
    df_features['z_w2v_unique_dis_cityblock_w'] = df_features.apply(lambda x: spatial.distance.cityblock(x['q1_unique_w2v_weight'], x['q2_unique_w2v_weight']), axis=1)
    df_features['z_w2v_unique_dis_canberra_w'] = df_features.apply(lambda x: spatial.distance.canberra(x['q1_unique_w2v_weight'], x['q2_unique_w2v_weight']), axis=1)
    df_features['z_w2v_unique_dis_mink'] = df_features.apply(lambda x: spatial.distance.minkowski(x['q1_unique_w2v'], x['q2_unique_w2v'], 3), axis=1)
    df_features['z_w2v_unique_dis_cityblock'] = df_features.apply(lambda x: spatial.distance.cityblock(x['q1_unique_w2v'], x['q2_unique_w2v']), axis=1)
    df_features['z_w2v_unique_dis_canberra'] = df_features.apply(lambda x: spatial.distance.canberra(x['q1_unique_w2v'], x['q2_unique_w2v']), axis=1)
    df_features['z_q1_unique_skew_w'] = df_features.q1_unique_w2v_weight.map(lambda x: skew(x))
    df_features['z_q2_unique_skew_w'] = df_features.q2_unique_w2v_weight.map(lambda x: skew(x))
    df_features['z_q1_unique_kur_w'] = df_features.q1_unique_w2v_weight.map(lambda x: kurtosis(x))
    df_features['z_q2_unique_kur_w'] = df_features.q2_unique_w2v_weight.map(lambda x: kurtosis(x))
    df_features['z_q1_unique_skew'] = df_features.q1_unique_w2v.map(lambda x: skew(x))
    df_features['z_q2_unique_skew'] = df_features.q2_unique_w2v.map(lambda x: skew(x))
    df_features['z_q1_unique_kur'] = df_features.q1_unique_w2v.map(lambda x: kurtosis(x))
    df_features['z_q2_unique_kur'] = df_features.q2_unique_w2v.map(lambda x: kurtosis(x))
    del df_features['q1_unique_w2v_weight']
    del df_features['q2_unique_w2v_weight']
    del df_features['q1_unique_w2v']
    del df_features['q2_unique_w2v']
    print('all done')
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    df_features = df_features.fillna(0.0)  # fillna returns a copy; assign it back
    return df_features
def _detect_artifacts(ica, raw, start_find, stop_find, ecg_ch, ecg_score_func,
                      ecg_criterion, eog_ch, eog_score_func, eog_criterion,
                      skew_criterion, kurt_criterion, var_criterion,
                      add_nodes):
    """Aux Function"""
    from scipy import stats

    nodes = []
    if ecg_ch is not None:
        nodes += [_ica_node('ECG', ecg_ch, ecg_score_func, ecg_criterion)]
    if eog_ch not in [None, []]:
        if not isinstance(eog_ch, list):
            eog_ch = [eog_ch]
        for idx, ch in enumerate(eog_ch):
            nodes += [_ica_node('EOG %02d' % idx, ch, eog_score_func,
                                eog_criterion)]
    if skew_criterion is not None:
        nodes += [_ica_node('skewness', None, stats.skew, skew_criterion)]
    if kurt_criterion is not None:
        nodes += [_ica_node('kurtosis', None, stats.kurtosis, kurt_criterion)]
    if var_criterion is not None:
        nodes += [_ica_node('variance', None, np.var, var_criterion)]
    if add_nodes is not None:
        nodes.extend(add_nodes)
    for node in nodes:
        scores = ica.score_sources(raw, start=start_find, stop=stop_find,
                                   target=node.target,
                                   score_func=node.score_func)
        if isinstance(node.criterion, float):
            found = list(np.where(np.abs(scores) > node.criterion)[0])
        else:
            found = list(np.atleast_1d(abs(scores).argsort()[node.criterion]))
        case = (len(found), 's' if len(found) > 1 else '', node.name)
        logger.info('    found %s artifact%s by %s' % case)
        ica.exclude += found
    logger.info('Artifact indices found:\n    ' + str(ica.exclude).strip('[]'))
    if len(set(ica.exclude)) != len(ica.exclude):
        logger.info('    Removing duplicate indices...')
        ica.exclude = list(set(ica.exclude))
    logger.info('Ready.')
Source file: default_predictor.py
Project: Default-Credit-Card-Prediction
Author: AlexPnt
def get_feature_stats(self):
    # get the input feature
    feature_input = self.feature_input.currentText()
    try:
        if feature_input[0] == 'X':
            try:
                feature_index = int("".join(feature_input[1:]))
                feature_index -= 1
            except:
                QtWidgets.QMessageBox.information(self, "Wrong Format", "Please enter a feature name in the format: X%d.")
                return
        elif "".join(feature_input[0]+feature_input[1]) == 'LD' or "".join(feature_input[0]+feature_input[1]) == 'PC':
            try:
                feature_index = int("".join(feature_input[2:]))
                feature_index -= 1
            except:
                QtWidgets.QMessageBox.information(self, "Wrong Format", "Please enter a feature name in the format: X||LD||PC%d.")
                return
        else:
            QtWidgets.QMessageBox.information(self, "Wrong Format", "Feature names must be in the format: X%d.")
            return
    except:
        QtWidgets.QMessageBox.information(self, "Data Not Found", "Please load a dataset first.")
        return
    try:
        max_value = self.X[:, feature_index].max()
        min_value = self.X[:, feature_index].min()
        mean_value = self.X[:, feature_index].mean()
        std_value = self.X[:, feature_index].std()
        var_value = self.X[:, feature_index].var()
        skewness = stats.skew(self.X[:, feature_index])
        kurtosis = stats.kurtosis(self.X[:, feature_index], fisher=True)
        chi2, chi_p_val = chi2_feature_test(self.X, self.y, int(feature_index))
        H_kw, kw_p_val = kw_feature_test(self.X, self.y, int(feature_index))
        info_gain = information_gain(self.X, self.y, int(feature_index))
        gain_rt = gain_ratio(self.X, self.y, int(feature_index))
    except:
        QtWidgets.QMessageBox.information(self, "Wrong Index", "Feature Index Out Of Bounds.")
        return
    feature_stats = """Statistics:\n\nMinimum Value: """ + str(min_value)\
        + """\n\nMaximum Value: """ + str(max_value)\
        + """\n\nMean: """ + str(mean_value)\
        + """\n\nStandard Deviation: """ + str(std_value)\
        + """\n\nVariance: """ + str(var_value)\
        + """\n\nSkewness: """ + str(skewness)\
        + """\n\nKurtosis: """ + str(kurtosis)\
        + """\n\nChi Squared Test: """ + str(chi2[0])\
        + """\n\nKruskal-Wallis Test: """ + str(H_kw)\
        + """\n\nInformation Gain: """ + str(info_gain)\
        + """\n\nGain Ratio: """ + str(gain_rt)
    self.feature_stats.setText(feature_stats)