def data_preprocess(train,test):
    """Turn the raw train/test frames into model-ready matrices.

    Steps, in order: drop a fixed list of outlier rows from ``train``; stack
    the 'MSSubClass'..'SaleCondition' columns of both frames; discard five
    columns; replace ``train["SalePrice"]`` with its ``log1p``; apply
    ``log1p`` to every numeric feature whose skewness on the training rows
    exceeds 0.75; one-hot encode with ``pd.get_dummies``; fill remaining
    NaNs with the column means.

    ``train`` is modified in place (rows dropped, 'SalePrice' overwritten).

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the columns 'MSSubClass'..'SaleCondition' and 'SalePrice'.
    test : pandas.DataFrame
        Must contain the columns 'MSSubClass'..'SaleCondition'.

    Returns
    -------
    tuple
        ``(X_train, X_test, y)`` where the first two are the leading and
        trailing row slices of the combined feature frame and ``y`` is the
        log-transformed 'SalePrice' column of ``train``.
    """
    # Positional row numbers of the training rows treated as outliers.
    rows_to_remove = [4,11,13,20,46,66,70,167,178,185,199, 224,261, 309,313,318, 349,412,423,440,454,477,478, 523,540, 581,588,595,654,688, 691, 774, 798, 875, 898,926,970,987,1027,1109, 1169,1182,1239, 1256,1298,1324,1353,1359,1405,1442,1447]
    train.drop(train.index[rows_to_remove],inplace=True)

    feature_span = slice('MSSubClass','SaleCondition')
    combined = pd.concat((train.loc[:,feature_span],
                          test.loc[:,feature_span]))
    combined = combined.drop(['Alley','FireplaceQu','PoolQC','Fence','MiscFeature'],axis=1)

    train["SalePrice"] = np.log1p(train["SalePrice"])

    # Skewness is measured on the training rows only, then the transform is
    # applied to the combined frame.
    def column_skewness(column):
        return skew(column.dropna())

    column_dtypes = combined.dtypes
    numeric_columns = column_dtypes[column_dtypes != "object"].index
    skewness = train[numeric_columns].apply(column_skewness)
    skewed_columns = skewness[skewness > 0.75].index
    combined[skewed_columns] = np.log1p(combined[skewed_columns])

    combined = pd.get_dummies(combined)
    combined = combined.fillna(combined.mean())

    n_train = train.shape[0]
    return combined[:n_train],combined[n_train:],train.SalePrice
python类skew()的实例源码
def data_preprocess(train,test):
    """Prepare feature matrices and a log-scaled target for modelling.

    Side effect: ``train`` loses the hard-coded outlier rows and its
    'SalePrice' column is overwritten with ``log1p`` of itself.

    Parameters
    ----------
    train : pandas.DataFrame
        Training data with columns 'MSSubClass'..'SaleCondition' and 'SalePrice'.
    test : pandas.DataFrame
        Test data with columns 'MSSubClass'..'SaleCondition'.

    Returns
    -------
    X_train, X_test, y
        Row slices of the combined, encoded feature frame (first
        ``len(train)`` rows, then the rest) and ``train.SalePrice``.
    """
    outlier_idx = [4,11,13,20,46,66,70,167,178,185,199, 224,261, 309,313,318, 349,412,423,440,454,477,478, 523,540, 581,588,595,654,688, 691, 774, 798, 875, 898,926,970,987,1027,1109, 1169,1182,1239, 1256,1298,1324,1353,1359,1405,1442,1447]
    dropped_columns = ['Alley','FireplaceQu','PoolQC','Fence','MiscFeature']
    skew_threshold = 0.75

    # Remove outliers from the caller's frame before anything is stacked.
    train.drop(train.index[outlier_idx],inplace=True)

    train_part = train.loc[:,'MSSubClass':'SaleCondition']
    test_part = test.loc[:,'MSSubClass':'SaleCondition']
    features = pd.concat((train_part,test_part)).drop(dropped_columns,axis=1)

    train["SalePrice"] = np.log1p(train["SalePrice"])

    # log1p every numeric feature whose training-set skewness is above the threshold
    non_object = features.dtypes[features.dtypes != "object"].index
    train_skew = train[non_object].apply(lambda col: skew(col.dropna()))
    to_transform = train_skew[train_skew > skew_threshold].index
    features[to_transform] = np.log1p(features[to_transform])

    # One-hot encode, then replace remaining NaNs with the column mean.
    encoded = pd.get_dummies(features)
    encoded = encoded.fillna(encoded.mean())

    boundary = train.shape[0]
    X_train = encoded[:boundary]
    X_test = encoded[boundary:]
    return X_train,X_test,train.SalePrice
def data_preprocess(train, test):
    """Build feature matrices and a log-scaled target, forward-filling gaps.

    Steps, in order: drop a fixed list of outlier rows from ``train``; stack
    the 'MSSubClass'..'SaleCondition' columns of both frames; discard five
    columns; replace ``train["SalePrice"]`` with its ``log1p``; apply
    ``log1p`` to every numeric feature whose skewness on the training rows
    exceeds 0.75; one-hot encode with ``pd.get_dummies``; forward-fill
    remaining NaNs.

    ``train`` is modified in place (rows dropped, 'SalePrice' overwritten).

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain the columns 'MSSubClass'..'SaleCondition' and 'SalePrice'.
    test : pandas.DataFrame
        Must contain the columns 'MSSubClass'..'SaleCondition'.

    Returns
    -------
    tuple
        ``(X_train, X_test, y)``: the first ``len(train)`` rows of the
        combined feature frame, the remaining rows, and ``train.SalePrice``.
    """
    outlier_idx = [4, 11, 13, 20, 46, 66, 70, 167, 178, 185, 199, 224, 261, 309, 313, 318, 349, 412, 423, 440, 454, 477,
                   478, 523, 540, 581, 588, 595, 654, 688, 691, 774, 798, 875, 898, 926, 970, 987, 1027, 1109, 1169,
                   1182, 1239, 1256, 1298, 1324, 1353, 1359, 1405, 1442, 1447]
    train.drop(train.index[outlier_idx], inplace=True)
    all_data = pd.concat((train.loc[:, 'MSSubClass':'SaleCondition'],
                          test.loc[:, 'MSSubClass':'SaleCondition']))
    to_delete = ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
    all_data = all_data.drop(to_delete, axis=1)

    train["SalePrice"] = np.log1p(train["SalePrice"])

    # log transform skewed numeric features (skewness measured on train only)
    numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
    skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna()))
    skewed_feats = skewed_feats[skewed_feats > 0.75].index
    all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

    all_data = pd.get_dummies(all_data)
    # DataFrame.ffill() is the supported spelling; fillna(method='ffill') is
    # deprecated in recent pandas releases.
    all_data = all_data.ffill()

    n_train = train.shape[0]
    X_train = all_data[:n_train]
    X_test = all_data[n_train:]
    y = train.SalePrice
    return X_train, X_test, y
calculate_aggregate_statistics.py 文件源码
项目:tbp-next-basket
作者: GiulioRossetti
项目源码
文件源码
阅读 27
收藏 0
点赞 0
评论 0
def calculate_aggregate(values):
    """Summarise ``values`` with a fixed set of descriptive statistics.

    Returns a dict keyed by short labels: 'avg', 'std', 'var', 'med', the
    10th/25th/50th/75th/90th percentiles, 'iqr' (75th minus 25th
    percentile), 'iqm', 'mad', 'cov' (mean divided by standard deviation),
    'gin', 'skw', 'kur' and 'sum'.
    """
    # One percentile call instead of seven separate ones.
    p10, p25, p50, p75, p90 = np.percentile(values, [10, 25, 50, 75, 90])
    mean_value = np.mean(values)
    std_value = np.std(values)

    return {
        'avg': mean_value,
        'std': std_value,
        'var': np.var(values),
        'med': np.median(values),
        '10p': p10,
        '25p': p25,
        '50p': p50,
        '75p': p75,
        '90p': p90,
        'iqr': p75 - p25,
        'iqm': interquartile_range_mean(values),
        'mad': mean_absolute_deviation(values),
        'cov': 1.0 * mean_value / std_value,
        'gin': gini_coefficient(values),
        'skw': stats.skew(values),
        'kur': stats.kurtosis(values),
        'sum': np.sum(values),
    }
def calculate_aggregate(values):
    """Compute a dictionary of aggregate measures for ``values``.

    Keys, in insertion order: 'avg', 'std', 'var', 'med', '10p', '25p',
    '50p', '75p', '90p', 'iqr', 'iqm', 'mad', 'cov', 'gin', 'skw', 'kur'.
    """
    measures = {}
    measures['avg'] = np.mean(values)
    measures['std'] = np.std(values)
    measures['var'] = np.var(values)
    measures['med'] = np.median(values)

    for label, rank in (('10p', 10), ('25p', 25), ('50p', 50), ('75p', 75), ('90p', 90)):
        measures[label] = np.percentile(values, rank)

    # Inter-quartile range: 75th percentile minus 25th percentile.
    measures['iqr'] = np.percentile(values, 75) - np.percentile(values, 25)
    measures['iqm'] = interquartile_range_mean(values)
    measures['mad'] = mean_absolute_deviation(values)
    # Mean over standard deviation, as in the original definition of 'cov'.
    measures['cov'] = 1.0 * np.mean(values) / np.std(values)
    measures['gin'] = gini_coefficient(values)
    measures['skw'] = stats.skew(values)
    measures['kur'] = stats.kurtosis(values)
    return measures
test_analytics.py 文件源码
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
作者: SignalMedia
项目源码
文件源码
阅读 30
收藏 0
点赞 0
评论 0
def test_skew(self):
    """Compare ``skew`` with scipy's bias-corrected skew and check small inputs."""
    tm._skip_if_no_scipy()
    from scipy.stats import skew

    def unbiased_skew(x):
        return skew(x, bias=False)

    self._check_stat_op('skew', unbiased_skew)

    # Corner cases: skew() returns NaN unless there are at least 3 values.
    min_N = 3
    for size in range(1, min_N + 1):
        ones_series = Series(np.ones(size))
        ones_frame = DataFrame(np.ones((size, size)))
        if size >= min_N:
            self.assertEqual(0, ones_series.skew())
            self.assertTrue((ones_frame.skew() == 0).all())
        else:
            self.assertTrue(np.isnan(ones_series.skew()))
            self.assertTrue(np.isnan(ones_frame.skew()).all())
test_panel.py 文件源码
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
作者: SignalMedia
项目源码
文件源码
阅读 23
收藏 0
点赞 0
评论 0
def test_skew(self):
    """Check the 'skew' stat op against scipy's bias-corrected skew."""
    try:
        from scipy.stats import skew
    except ImportError:
        raise nose.SkipTest("no scipy.stats.skew")

    def reference_skew(x):
        # Fewer than three observations yields NaN.
        return skew(x, bias=False) if len(x) >= 3 else np.nan

    self._check_stat_op('skew', reference_skew)
# def test_mad(self):
# f = lambda x: np.abs(x - x.mean()).mean()
# self._check_stat_op('mad', f)
test_panel.py 文件源码
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
作者: SignalMedia
项目源码
文件源码
阅读 21
收藏 0
点赞 0
评论 0
def test_sem(self):
    """Check the 'sem' stat op against a sample-std based reference."""
    def reference_sem(x):
        # Standard error of the mean; NaN with fewer than two observations.
        return np.std(x, ddof=1) / np.sqrt(len(x)) if len(x) >= 2 else np.nan

    self._check_stat_op('sem', reference_sem)
# def test_skew(self):
# from scipy.stats import skew
# def alt(x):
# if len(x) < 3:
# return np.nan
# return skew(x, bias=False)
# self._check_stat_op('skew', alt)
test_panel4d.py 文件源码
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
作者: SignalMedia
项目源码
文件源码
阅读 28
收藏 0
点赞 0
评论 0
def test_sem(self):
    """Verify the 'sem' stat op using std(ddof=1) / sqrt(n) as the reference."""
    def expected_sem(x):
        count = len(x)
        if count >= 2:
            sample_std = np.std(x, ddof=1)
            return sample_std / np.sqrt(count)
        # Not enough observations for a sample standard deviation.
        return np.nan

    self._check_stat_op('sem', expected_sem)
# def test_skew(self):
# from scipy.stats import skew
# def alt(x):
# if len(x) < 3:
# return np.nan
# return skew(x, bias=False)
# self._check_stat_op('skew', alt)
test_nanops.py 文件源码
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
作者: SignalMedia
项目源码
文件源码
阅读 31
收藏 0
点赞 0
评论 0
def test_returned_dtype(self):
    """Check the dtype returned by Series reductions for several input dtypes.

    For integer inputs 'mean', 'std', 'var', 'skew' and 'kurt' are expected
    to return np.float64; every other combination is expected to return the
    input dtype.
    """
    float_result_ops = ['mean', 'std', 'var', 'skew', 'kurt']
    same_dtype_ops = ['min', 'max']

    candidate_dtypes = [np.int16, np.int32, np.int64, np.float32, np.float64]
    if hasattr(np, 'float128'):
        candidate_dtypes.append(np.float128)

    for dtype in candidate_dtypes:
        series = Series(range(10), dtype=dtype)
        for method in float_result_ops + same_dtype_ops:
            result = getattr(series, method)()
            promoted = is_integer_dtype(dtype) and method in float_result_ops
            if not promoted:
                self.assertTrue(
                    result.dtype == dtype,
                    "return dtype expected from %s is %s, "
                    "got %s instead" % (method, dtype, result.dtype))
                continue
            self.assertTrue(
                result.dtype == np.float64,
                "return dtype expected from %s is np.float64, "
                "got %s instead" % (method, result.dtype))
def ka_display_skewnewss(data):
    '''Tabulate the skewness of every non-object column.

    Parameters
    ----------
    data: pandas dataframe

    Return
    ------
    df: pandas dataframe
        One row per non-object column with the columns 'var_name',
        'col_type' (dtype rendered as a string) and 'skew_value'.
    '''
    non_object = data.dtypes != 'object'
    names = data.columns[non_object].tolist()
    type_labels = data.dtypes[non_object].apply(str).values
    skew_values = [skew(data[name]) for name in names]

    summary = pd.concat(
        [pd.Series(names), pd.Series(type_labels), pd.Series(skew_values)],
        axis=1)
    summary.columns = ['var_name', 'col_type', 'skew_value']
    return summary
def mfccPostProcess(directory,fileCount):
    """Write per-column summary statistics for every stored MFCC matrix.

    For each file number in ``range(fileCount)`` and each suffix in the
    module-level ``mfccList``, loads ``directory + str(n) + suffix + ".csv"``,
    computes its delta with ``librosa.feature.delta`` and stores 14
    statistics per column (mean, variance, skew, kurtosis, median, min, max
    for the matrix, then the same seven for its delta). NaNs are replaced
    with 0 and the result is saved, flattened in column-major order, next to
    the input with the suffix ``"_stat.txt"``.
    """
    def column_stats(matrix):
        # The seven statistics, in the order they are stored.
        return (
            np.mean(matrix, axis=0),
            np.var(matrix, axis=0, dtype=np.float64),
            stats.skew(matrix, axis=0),
            stats.kurtosis(matrix, axis=0, fisher=False),
            np.median(matrix, axis=0),
            np.min(matrix, axis=0),
            np.max(matrix, axis=0),
        )

    for file_no in range(fileCount):
        print("{0}/{1}".format(file_no+1,fileCount))
        for mfccext in mfccList:
            stem = directory+str(file_no)+mfccext
            mfcc = np.loadtxt(stem+".csv",delimiter=",")
            dmfcc = librosa.feature.delta(mfcc)

            result = np.zeros((mfcc.shape[1],14))
            # Slots 0-6 hold the matrix statistics, 7-13 the delta statistics.
            for slot, column_values in enumerate(column_stats(mfcc) + column_stats(dmfcc)):
                result[:,slot] = column_values

            result[np.isnan(result)] = 0
            np.savetxt(stem+"_stat.txt",result.flatten("F"),delimiter=",")
def process(self, obj_data):
    '''
    Apply Skew analysis with results added to the data wrapper

    For every (label, frame) pair yielded by the wrapper and every default
    column, the missing values are dropped, the lowest and highest 2% of the
    sorted values are discarded, and the skewness of the remainder is stored
    under results[label][column].

    @param obj_data: Data wrapper
    '''
    column_names = obj_data.getDefaultColumns()
    results = defaultdict(dict)

    # for label, frame in tqdm(obj_data.getIterator()):
    for label, frame in obj_data.getIterator():
        for column in column_names:
            # dropping missing data in order to remove top and bottom 2%
            data = frame[column].dropna().sort_values(ascending=True)

            # Remove top and bottom 2%. The upper bound is written as an
            # explicit position because a "-0" stop would select nothing
            # when fewer than 25 values are available.
            rem_num = int(round(len(data) * 0.02))
            trimmed = data.iloc[rem_num:len(data) - rem_num]

            res = skew(trimmed)
            if isinstance(res, np.ma.masked_array):
                # np.float was removed from NumPy; the builtin is equivalent.
                res = float(res.data)
            results[label][column] = res

    obj_data.addResult(self.str_description, results)
two_sigma_financial_modelling.py 文件源码
项目:PortfolioTimeSeriesAnalysis
作者: MizioAnd
项目源码
文件源码
阅读 24
收藏 0
点赞 0
评论 0
def skew_correction(df, numerical_features):
    """Log-transform, in place, the strongly right-skewed numerical columns.

    Skewness is computed per column of ``df[numerical_features]`` after
    dropping NaNs; columns whose skewness exceeds 0.75 are overwritten with
    ``log1p`` of their values cast to float. Returns None.
    """
    def column_skewness(column):
        return skew(column.dropna())

    skewness = df[numerical_features].apply(column_skewness)
    skewed_columns = skewness[skewness > 0.75].index
    transformed = np.log1p(np.asarray(df[skewed_columns], dtype=float))
    df.loc[:, tuple(skewed_columns)] = transformed
def ideal_bin_count(data, method="default"):
    """A theoretically ideal bin count.

    Parameters
    ----------
    data: array_like or None
        Data to work on. Only its size is used, except by "doane", which
        also needs the sample skewness. ``None`` is treated as empty data.
    method: str
        Name of the method to apply, available values:
          - default (~sturges)
          - sqrt
          - sturges
          - doane
          - rice
        See https://en.wikipedia.org/wiki/Histogram for the description

    Returns
    -------
    int
        Number of bins, always >= 1

    Raises
    ------
    ValueError
        If `method` is not one of the names listed above.
    """
    if data is None:
        return 1
    data = np.asarray(data)
    n = data.size
    if n < 1:
        return 1

    if method == "default":
        # Small samples get a fixed count; larger ones fall back to Sturges.
        if n <= 32:
            return 7
        return ideal_bin_count(data, "sturges")
    if method == "sqrt":
        return int(np.ceil(np.sqrt(n)))
    if method == "sturges":
        return int(np.ceil(np.log2(n)) + 1)
    if method == "doane":
        if n < 3:
            return 1
        from scipy.stats import skew
        # Standard deviation of the sample skewness:
        # sqrt(6 (n - 2) / ((n + 1)(n + 3))). The whole product belongs in
        # the denominator.
        sigma = np.sqrt(6.0 * (n - 2) / ((n + 1) * (n + 3)))
        skewness = np.abs(skew(data, axis=None))
        return int(np.ceil(1 + np.log2(n) + np.log2(1 + skewness / sigma)))
    if method == "rice":
        # Float exponent so the cube root is not truncated by integer division.
        return int(np.ceil(2 * np.power(n, 1.0 / 3)))

    raise ValueError("Unknown bin count method: {0!r}".format(method))
def skew_correction(df, numerical_features):
    """Apply ``log1p`` in place to numerical columns with skewness above 0.75.

    Skewness is measured on each column of ``df[numerical_features]`` with
    NaNs removed. The selected columns are replaced by ``log1p`` of their
    float values; nothing is returned.
    """
    # Skew correction
    skew_by_column = df[numerical_features].apply(lambda col: skew(col.dropna()))
    is_skewed = skew_by_column > 0.75
    targets = skew_by_column[is_skewed].index
    raw_values = np.asarray(df[targets], dtype=float)
    df.loc[:, tuple(targets)] = np.log1p(raw_values)
# df[skewed_feats] = np.log1p(np.asarray(df[skewed_feats], dtype=float))
test_panel4d.py 文件源码
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
作者: SignalMedia
项目源码
文件源码
阅读 33
收藏 0
点赞 0
评论 0
def test_skew(self):
    """Verify the 'skew' stat op using scipy's skew with bias=False."""
    try:
        from scipy.stats import skew
    except ImportError:
        raise nose.SkipTest("no scipy.stats.skew")

    def expected_skew(x):
        enough_points = len(x) >= 3
        if enough_points:
            return skew(x, bias=False)
        # Too few observations for a skew estimate.
        return np.nan

    self._check_stat_op('skew', expected_skew)
# def test_mad(self):
# f = lambda x: np.abs(x - x.mean()).mean()
# self._check_stat_op('mad', f)
test_analytics.py 文件源码
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
作者: SignalMedia
项目源码
文件源码
阅读 30
收藏 0
点赞 0
评论 0
def test_skew(self):
    """Check the 'skew' stat op against scipy's bias-corrected estimator."""
    tm._skip_if_no_scipy()
    from scipy.stats import skew

    def reference(x):
        # NaN when there are fewer than three observations.
        return np.nan if len(x) < 3 else skew(x, bias=False)

    self._check_stat_op('skew', reference)
test_analytics.py 文件源码
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
作者: SignalMedia
项目源码
文件源码
阅读 34
收藏 0
点赞 0
评论 0
def test_stats_mixed_type(self):
    """Row-wise std/var/mean/skew on the mixed-type frame must not raise."""
    # don't blow up
    for stat_name in ('std', 'var', 'mean', 'skew'):
        getattr(self.mixed_frame, stat_name)(1)
test_window.py 文件源码
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
作者: SignalMedia
项目源码
文件源码
阅读 27
收藏 0
点赞 0
评论 0
def test_how_compat(self):
    """Compare the legacy ``pd.<kind>_<op>`` functions with the method API.

    In prior versions ``how`` was allowed in the resample; now that it is
    deprecated it has to be handled in the actual aggregation functions.
    Every combination is expected to emit a FutureWarning.
    """
    series = pd.Series(
        np.random.randn(20),
        index=pd.date_range('1/1/2000', periods=20, freq='12H'))

    for how in ['min', 'max', 'median']:
        for op in ['mean', 'sum', 'std', 'var', 'kurt', 'skew']:
            for kind in ['rolling', 'expanding']:
                with tm.assert_produces_warning(FutureWarning,
                                                check_stacklevel=False):
                    legacy_func = getattr(pd, "{0}_{1}".format(kind, op))
                    if legacy_func is None:
                        continue

                    # Only the rolling variant takes a window size.
                    window_kwargs = {'window': 5} if kind == 'rolling' else {}
                    result = legacy_func(series, freq='D', how=how, **window_kwargs)

                    windowed = getattr(series, kind)(freq='D', **window_kwargs)
                    expected = getattr(windowed, op)(how=how)
                    assert_series_equal(result, expected)
test_window.py 文件源码
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
作者: SignalMedia
项目源码
文件源码
阅读 25
收藏 0
点赞 0
评论 0
def test_rolling_skew(self):
    """Check ``mom.rolling_skew`` against scipy's bias-corrected skew."""
    try:
        from scipy.stats import skew
    except ImportError:
        raise nose.SkipTest('no scipy')

    def unbiased_skew(window):
        return skew(window, bias=False)

    self._check_moment_func(mom.rolling_skew, unbiased_skew, name='skew')
test_nanops.py 文件源码
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
作者: SignalMedia
项目源码
文件源码
阅读 27
收藏 0
点赞 0
评论 0
def test_nanskew(self):
    """Check ``nanops.nanskew`` against scipy's skew via ``_skew_kurt_wrap``."""
    tm.skip_if_no_package('scipy.stats')
    tm._skip_if_scipy_0_17()
    from scipy.stats import skew

    reference = partial(self._skew_kurt_wrap, func=skew)
    # Complex, string, date and timedelta inputs are excluded from the check.
    excluded_inputs = dict(allow_complex=False, allow_str=False,
                           allow_date=False, allow_tdelta=False)
    self.check_funs(nanops.nanskew, reference, **excluded_inputs)
test_nanops.py 文件源码
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
作者: SignalMedia
项目源码
文件源码
阅读 27
收藏 0
点赞 0
评论 0
def setUp(self):
    """Store the test samples and their reference skewness on the instance."""
    # Test data + skewness value (computed with scipy.stats.skew)
    grid = np.linspace(0, 1, 200)
    self.samples = np.sin(grid)
    self.actual_skew = -0.1875895205961754
test_nanops.py 文件源码
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
作者: SignalMedia
项目源码
文件源码
阅读 28
收藏 0
点赞 0
评论 0
def test_constant_series(self):
    """``nanops.nanskew`` of a constant array must be exactly 0.0."""
    # xref GH 11974
    for constant in (3075.2, 3075.3, 3075.5):
        flat_data = constant * np.ones(300)
        self.assertEqual(nanops.nanskew(flat_data), 0.0)
test_nanops.py 文件源码
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
作者: SignalMedia
项目源码
文件源码
阅读 32
收藏 0
点赞 0
评论 0
def test_ground_truth(self):
    """``nanops.nanskew`` of the stored samples matches the stored reference."""
    computed = nanops.nanskew(self.samples)
    self.assertAlmostEqual(computed, self.actual_skew)
test_nanops.py 文件源码
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
作者: SignalMedia
项目源码
文件源码
阅读 43
收藏 0
点赞 0
评论 0
def test_nans(self):
    """With ``skipna=False`` a NaN in the input makes the skew NaN."""
    with_nan = np.hstack([self.samples, np.nan])
    result = nanops.nanskew(with_nan, skipna=False)
    self.assertTrue(np.isnan(result))
test_nanops.py 文件源码
项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
作者: SignalMedia
项目源码
文件源码
阅读 29
收藏 0
点赞 0
评论 0
def test_nans_skipna(self):
    """With ``skipna=True`` an appended NaN does not change the skew."""
    with_nan = np.hstack([self.samples, np.nan])
    result = nanops.nanskew(with_nan, skipna=True)
    tm.assert_almost_equal(result, self.actual_skew)
def statistical_metrics(x):
    """
    Calculates statistical metrics on input array (mean, std, skew, kurtosis).

    The array is flattened first; the result is a dict with the keys
    'mean', 'stdev', 'skew' and 'kurtosis'.
    """
    flat = x.flatten()
    return {
        'mean': np.mean(flat),
        'stdev': np.std(flat),
        'skew': stats.skew(flat),
        'kurtosis': stats.kurtosis(flat),
    }
def test_gen_usr_distrib(n_samples=100000, verbose=False):
    """Check the moments of samples drawn by ``_gen_usr_distrib``.

    'laplace' draws are checked for mean 0, standard deviation 1, skew 0
    and kurtosis 3; 'exp' draws for standard deviation 1. All comparisons
    use an absolute tolerance of 5e-2.
    """
    tolerance = 5e-2
    rng = np.random.RandomState(0)

    laplace_draws = _gen_usr_distrib(n_samples, ['laplace'], rng)
    assert_allclose(np.mean(laplace_draws), 0, atol=tolerance)
    assert_allclose(np.std(laplace_draws), 1, atol=tolerance)
    assert_allclose(skew(laplace_draws)[0], 0, atol=tolerance)
    assert_allclose(kurtosis(laplace_draws)[0], 3, atol=tolerance)

    exp_draws = _gen_usr_distrib(n_samples, ['exp'], rng)
    assert_allclose(np.std(exp_draws), 1, atol=tolerance)
def get_features(df_features):
    """Add pairwise question features to ``df_features`` and return it.

    Reads the 'question1' and 'question2' columns and adds, in place:
    'z_document_dis', 'q1_unique', 'q2_unique', 'z_dist',
    'z_tfidf_cos_sim', 'z_w2v_unique', four vector distances
    ('z_w2v_dis_e', 'z_w2v_dis_mink', 'z_w2v_dis_cityblock',
    'z_w2v_dis_canberra') and the skew/kurtosis of each question vector
    ('z_q1_skew', 'z_q2_skew', 'z_q1_kur', 'z_q2_kur'). The intermediate
    'question1_w2v'/'question2_w2v' columns are removed again, and NaNs are
    replaced with 0.0 before returning.

    Progress is reported on stdout as a stage label followed by the current
    time.

    Parameters
    ----------
    df_features : pandas.DataFrame
        Frame with 'question1' and 'question2' columns; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df_features`` object.
    """
    def announce(stage):
        # print(...) with a single argument behaves the same on Python 2 and 3.
        print(stage)
        print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

    announce('use w2v to document presentation')
    df_features['z_document_dis'] = df_features.apply(lambda x: getDiff_averge_tfidf(x['question1'], x['question2']), axis = 1)

    announce('nones')
    df_features['q1_unique'] = df_features.apply(lambda x: getdiffwords(x['question1'], x['question2']), axis = 1)
    df_features['q2_unique'] = df_features.apply(lambda x: getdiffwords(x['question2'], x['question1']), axis = 1)
    #df_features['question1_nouns'] = df_features.question1.map(lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(str(x).lower())) if t[:1] in ['N']])
    #df_features['question2_nouns'] = df_features.question2.map(lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(str(x).lower())) if t[:1] in ['N']])
    df_features['question1_w2v'] = df_features.question1.map(lambda x: get_vector_tfidf(" ".join(x)))
    df_features['question2_w2v'] = df_features.question2.map(lambda x: get_vector_tfidf(" ".join(x)))

    announce('z_dist')
    df_features['z_dist'] = df_features.apply(lambda x:Levenshtein.ratio(x['question1'], x['question2']), axis=1)

    announce('z_tfidf_cos_sim')
    df_features['z_tfidf_cos_sim'] = df_features.apply(lambda x: cos_sim(x['question1'], x['question2']), axis=1)

    announce('z_w2v_nones')
    df_features['z_w2v_unique'] = df_features.apply(lambda x: w2v_cos_sim_tfidf(x['q1_unique'], x['q2_unique']), axis=1)
    df_features['z_w2v_dis_e'] = df_features.apply(lambda x: spatial.distance.euclidean(x['question1_w2v'], x['question2_w2v']), axis=1)
    df_features['z_w2v_dis_mink'] = df_features.apply(lambda x: spatial.distance.minkowski(x['question1_w2v'], x['question2_w2v'],3), axis=1)
    df_features['z_w2v_dis_cityblock'] = df_features.apply(lambda x: spatial.distance.cityblock(x['question1_w2v'], x['question2_w2v']), axis=1)
    df_features['z_w2v_dis_canberra'] = df_features.apply(lambda x: spatial.distance.canberra(x['question1_w2v'], x['question2_w2v']), axis=1)
    df_features['z_q1_skew'] = df_features.question1_w2v.map(lambda x:skew(x))
    df_features['z_q2_skew'] = df_features.question2_w2v.map(lambda x:skew(x))
    df_features['z_q1_kur'] = df_features.question1_w2v.map(lambda x:kurtosis(x))
    df_features['z_q2_kur'] = df_features.question2_w2v.map(lambda x:kurtosis(x))

    # The vector columns were only needed to derive the features above.
    del df_features['question1_w2v']
    del df_features['question2_w2v']

    # Fresh timestamp here rather than the one taken at the previous stage.
    announce('all done')

    # fillna returns a new frame unless inplace=True; fill in place so the
    # caller's frame and the returned frame both have the NaNs replaced.
    df_features.fillna(0.0, inplace=True)
    return df_features