python类skew()的实例源码

gradient_boosting.py 文件源码 项目:HousePricePredictionKaggle 作者: Nuwantha 项目源码 文件源码 阅读 27 收藏 0 点赞 0 评论 0
def data_preprocess(train, test):
    """Turn the raw train/test frames into model inputs and a log target.

    Side effects on ``train`` (kept from the original behaviour): the rows at
    the hard-coded outlier positions are dropped in place and ``SalePrice``
    is overwritten with ``log1p(SalePrice)``.

    Returns a tuple ``(X_train, X_test, y)`` where the two feature frames are
    slices of one combined, dummy-encoded frame whose missing values were
    replaced by the column means, and ``y`` is the transformed ``SalePrice``.
    """
    # Positional row numbers (not labels) of the rows treated as outliers.
    outlier_idx = [4, 11, 13, 20, 46, 66, 70, 167, 178, 185, 199, 224, 261,
                   309, 313, 318, 349, 412, 423, 440, 454, 477, 478, 523, 540,
                   581, 588, 595, 654, 688, 691, 774, 798, 875, 898, 926, 970,
                   987, 1027, 1109, 1169, 1182, 1239, 1256, 1298, 1324, 1353,
                   1359, 1405, 1442, 1447]
    train.drop(train.index[outlier_idx], inplace=True)

    # Stack both feature ranges so every transformation is applied once.
    feature_span = slice('MSSubClass', 'SaleCondition')
    all_data = pd.concat((train.loc[:, feature_span],
                          test.loc[:, feature_span]))
    all_data = all_data.drop(
        ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature'], axis=1)

    train["SalePrice"] = np.log1p(train["SalePrice"])

    # Skewness is measured on the training rows only, then the log transform
    # is applied to those columns in the combined frame.
    numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
    skewness = train[numeric_feats].apply(lambda col: skew(col.dropna()))
    skewed_feats = skewness[skewness > 0.75].index
    all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

    all_data = pd.get_dummies(all_data)
    all_data = all_data.fillna(all_data.mean())

    # The first len(train) rows of the combined frame are the training rows.
    n_train = train.shape[0]
    X_train = all_data[:n_train]
    X_test = all_data[n_train:]
    return X_train, X_test, train.SalePrice
RandomForest.py 文件源码 项目:HousePricePredictionKaggle 作者: Nuwantha 项目源码 文件源码 阅读 27 收藏 0 点赞 0 评论 0
def data_preprocess(train, test):
    """Prepare feature matrices and the log-transformed price target.

    ``train`` is modified in place: the hard-coded outlier rows are removed
    and its ``SalePrice`` column is replaced by ``log1p(SalePrice)``.

    The features of both frames (columns ``MSSubClass`` through
    ``SaleCondition``, minus five dropped columns) are concatenated,
    log-transformed where the training skewness exceeds 0.75, dummy-encoded
    and mean-imputed.  Returns ``(X_train, X_test, y)``.
    """
    to_delete = ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
    # Row positions (looked up through train.index) that are discarded.
    outlier_idx = [
        4, 11, 13, 20, 46, 66, 70, 167, 178, 185, 199, 224, 261, 309, 313,
        318, 349, 412, 423, 440, 454, 477, 478, 523, 540, 581, 588, 595, 654,
        688, 691, 774, 798, 875, 898, 926, 970, 987, 1027, 1109, 1169, 1182,
        1239, 1256, 1298, 1324, 1353, 1359, 1405, 1442, 1447,
    ]
    outlier_labels = train.index[outlier_idx]
    train.drop(outlier_labels, inplace=True)

    train_part = train.loc[:, 'MSSubClass':'SaleCondition']
    test_part = test.loc[:, 'MSSubClass':'SaleCondition']
    all_data = pd.concat((train_part, test_part))
    all_data = all_data.drop(to_delete, axis=1)

    # The target is modelled on a log scale.
    train["SalePrice"] = np.log1p(train["SalePrice"])

    # Log-transform numeric features that are skewed in the training rows.
    is_numeric = all_data.dtypes != "object"
    numeric_feats = all_data.dtypes[is_numeric].index

    def column_skew(column):
        return skew(column.dropna())

    skewness = train[numeric_feats].apply(column_skew)
    skewed_feats = skewness[skewness > 0.75].index
    all_data[skewed_feats] = np.log1p(all_data[skewed_feats])

    # One-hot encode, then fill the remaining gaps with column means.
    all_data = pd.get_dummies(all_data)
    column_means = all_data.mean()
    all_data = all_data.fillna(column_means)

    split_at = train.shape[0]
    y = train.SalePrice
    return all_data[:split_at], all_data[split_at:], y
ensemble_stacking.py 文件源码 项目:HousePricePredictionKaggle 作者: Nuwantha 项目源码 文件源码 阅读 28 收藏 0 点赞 0 评论 0
def data_preprocess(train, test):
    """Prepare feature matrices and the log-transformed price target.

    ``train`` is modified in place: the rows at the hard-coded outlier
    positions are dropped and ``SalePrice`` is replaced by
    ``log1p(SalePrice)``.

    The feature columns ``MSSubClass`` .. ``SaleCondition`` of both frames
    are stacked, five columns are dropped, numeric columns whose training
    skewness exceeds 0.75 are log1p-transformed, the result is dummy-encoded
    and missing values are forward-filled (each gap takes the value of the
    previous row of the stacked frame; a gap in the very first row stays NaN
    because there is nothing to carry forward).

    Returns ``(X_train, X_test, y)``.
    """
    # Positional row numbers (resolved through train.index) to discard.
    outlier_idx = [4, 11, 13, 20, 46, 66, 70, 167, 178, 185, 199, 224, 261, 309, 313, 318, 349, 412, 423, 440, 454, 477,
                   478, 523, 540, 581, 588, 595, 654, 688, 691, 774, 798, 875, 898, 926, 970, 987, 1027, 1109, 1169,
                   1182, 1239, 1256, 1298, 1324, 1353, 1359, 1405, 1442, 1447]
    train.drop(train.index[outlier_idx], inplace=True)
    all_data = pd.concat((train.loc[:, 'MSSubClass':'SaleCondition'],
                          test.loc[:, 'MSSubClass':'SaleCondition']))

    to_delete = ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
    all_data = all_data.drop(to_delete, axis=1)

    train["SalePrice"] = np.log1p(train["SalePrice"])
    # Log-transform numeric features that are skewed in the training rows.
    numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
    skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna()))
    skewed_feats = skewed_feats[skewed_feats > 0.75].index
    all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
    all_data = pd.get_dummies(all_data)
    # DataFrame.ffill() is the supported spelling; the `method=` keyword of
    # fillna() is deprecated in recent pandas releases.
    all_data = all_data.ffill()
    X_train = all_data[:train.shape[0]]
    X_test = all_data[train.shape[0]:]
    y = train.SalePrice

    return X_train, X_test, y
calculate_aggregate_statistics.py 文件源码 项目:tbp-next-basket 作者: GiulioRossetti 项目源码 文件源码 阅读 27 收藏 0 点赞 0 评论 0
def calculate_aggregate(values):
    """Summarise ``values`` with a fixed set of descriptive statistics.

    Returns a dict keyed by short measure names (mean, spread, percentiles,
    inter-quartile measures, Gini, skewness, kurtosis and the sum).
    """
    mean_value = np.mean(values)
    std_value = np.std(values)
    first_quartile = np.percentile(values, 25)
    third_quartile = np.percentile(values, 75)

    agg_measures = {
        'avg': mean_value,
        'std': std_value,
        'var': np.var(values),
        'med': np.median(values),
        '10p': np.percentile(values, 10),
        '25p': first_quartile,
        '50p': np.percentile(values, 50),
        '75p': third_quartile,
        '90p': np.percentile(values, 90),
        'iqr': third_quartile - first_quartile,
        'iqm': interquartile_range_mean(values),
        'mad': mean_absolute_deviation(values),
        # NOTE(review): this is mean/std, i.e. the reciprocal of the usual
        # coefficient of variation -- confirm that is what consumers expect.
        'cov': 1.0 * mean_value / std_value,
        'gin': gini_coefficient(values),
        'skw': stats.skew(values),
        'kur': stats.kurtosis(values),
        'sum': np.sum(values)
    }

    return agg_measures
calculate_aggregate_statistics.py 文件源码 项目:TX-Means 作者: riccotti 项目源码 文件源码 阅读 33 收藏 0 点赞 0 评论 0
def calculate_aggregate(values):
    """Compute a dictionary of descriptive statistics for ``values``.

    Keys: avg, std, var, med, 10p/25p/50p/75p/90p percentiles, iqr, iqm, mad,
    cov, gin, skw and kur.
    """
    avg = np.mean(values)
    std = np.std(values)
    q25 = np.percentile(values, 25)
    q75 = np.percentile(values, 75)

    agg_measures = {}
    agg_measures['avg'] = avg
    agg_measures['std'] = std
    agg_measures['var'] = np.var(values)
    agg_measures['med'] = np.median(values)
    agg_measures['10p'] = np.percentile(values, 10)
    agg_measures['25p'] = q25
    agg_measures['50p'] = np.percentile(values, 50)
    agg_measures['75p'] = q75
    agg_measures['90p'] = np.percentile(values, 90)
    agg_measures['iqr'] = q75 - q25
    agg_measures['iqm'] = interquartile_range_mean(values)
    agg_measures['mad'] = mean_absolute_deviation(values)
    # NOTE(review): mean divided by std (inverse of the conventional
    # coefficient of variation) -- verify against downstream users.
    agg_measures['cov'] = 1.0 * avg / std
    agg_measures['gin'] = gini_coefficient(values)
    agg_measures['skw'] = stats.skew(values)
    agg_measures['kur'] = stats.kurtosis(values)

    return agg_measures
test_analytics.py 文件源码 项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda 作者: SignalMedia 项目源码 文件源码 阅读 30 收藏 0 点赞 0 评论 0
def test_skew(self):
    # Compare the pandas skew reduction with scipy's bias-corrected skew.
    tm._skip_if_no_scipy()

    from scipy.stats import skew

    def unbiased_skew(values):
        return skew(values, bias=False)

    self._check_stat_op('skew', unbiased_skew)

    # Corner cases: skew() is NaN until at least three values are present;
    # with three or more identical values it is exactly zero.
    min_N = 3
    for size in range(1, min_N + 1):
        s = Series(np.ones(size))
        df = DataFrame(np.ones((size, size)))
        if size >= min_N:
            self.assertEqual(0, s.skew())
            self.assertTrue((df.skew() == 0).all())
            continue
        self.assertTrue(np.isnan(s.skew()))
        self.assertTrue(np.isnan(df.skew()).all())
test_panel.py 文件源码 项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda 作者: SignalMedia 项目源码 文件源码 阅读 23 收藏 0 点赞 0 评论 0
def test_skew(self):
    # Skip when scipy is unavailable; it supplies the reference value.
    try:
        from scipy.stats import skew
    except ImportError:
        raise nose.SkipTest("no scipy.stats.skew")

    def reference_skew(sample):
        # Fewer than three observations yields NaN in the reference.
        return skew(sample, bias=False) if len(sample) >= 3 else np.nan

    self._check_stat_op('skew', reference_skew)

    # def test_mad(self):
    #     f = lambda x: np.abs(x - x.mean()).mean()
    #     self._check_stat_op('mad', f)
test_panel.py 文件源码 项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda 作者: SignalMedia 项目源码 文件源码 阅读 21 收藏 0 点赞 0 评论 0
def test_sem(self):
    # Reference standard error of the mean: sample std (ddof=1) over sqrt(n),
    # NaN when fewer than two observations are available.
    def reference_sem(sample):
        count = len(sample)
        if count >= 2:
            return np.std(sample, ddof=1) / np.sqrt(count)
        return np.nan

    self._check_stat_op('sem', reference_sem)

    # def test_skew(self):
    #     from scipy.stats import skew

    #     def alt(x):
    #         if len(x) < 3:
    #             return np.nan
    #         return skew(x, bias=False)

    #     self._check_stat_op('skew', alt)
test_panel4d.py 文件源码 项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda 作者: SignalMedia 项目源码 文件源码 阅读 28 收藏 0 点赞 0 评论 0
def test_sem(self):
    # The reference is NaN below two observations, otherwise the ddof=1
    # standard deviation divided by the square root of the sample size.
    def expected_sem(x):
        return np.nan if len(x) < 2 else np.std(x, ddof=1) / np.sqrt(len(x))

    self._check_stat_op('sem', expected_sem)

    # def test_skew(self):
    #     from scipy.stats import skew

    #     def alt(x):
    #         if len(x) < 3:
    #             return np.nan
    #         return skew(x, bias=False)

    #     self._check_stat_op('skew', alt)
test_nanops.py 文件源码 项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda 作者: SignalMedia 项目源码 文件源码 阅读 31 收藏 0 点赞 0 评论 0
def test_returned_dtype(self):
    # Reductions in `float_reductions` must return float64 for integer
    # input; every other combination must keep the input dtype.
    float_reductions = ['mean', 'std', 'var', 'skew', 'kurt']
    extrema = ['min', 'max']

    dtypes = [np.int16, np.int32, np.int64, np.float32, np.float64]
    # float128 is only present on some platforms.
    if hasattr(np, 'float128'):
        dtypes.append(np.float128)

    for dtype in dtypes:
        s = Series(range(10), dtype=dtype)
        integer_input = is_integer_dtype(dtype)
        for method in float_reductions + extrema:
            result = getattr(s, method)()
            if integer_input and method in float_reductions:
                message = ("return dtype expected from %s is np.float64, "
                           "got %s instead" % (method, result.dtype))
                self.assertTrue(result.dtype == np.float64, message)
            else:
                message = ("return dtype expected from %s is %s, "
                           "got %s instead" % (method, dtype, result.dtype))
                self.assertTrue(result.dtype == dtype, message)
KA_data_exploration.py 文件源码 项目:Kaggle_Buddy 作者: NickYi1990 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def ka_display_skewnewss(data):
    '''Show skewness information for every non-object column.

        Missing values are dropped column by column before the skewness is
        computed, so a column with NaN entries reports the skewness of its
        observed values instead of NaN.

        Parameters
        ----------
        data: pandas dataframe

        Return
        ------
        df: pandas dataframe
            One row per non-object column with the columns
            'var_name', 'col_type' (dtype as a string) and 'skew_value'.
    '''
    # Compute the column selection once and reuse it for names and dtypes.
    non_object = data.dtypes != 'object'
    numeric_cols = data.columns[non_object].tolist()
    col_types = [str(col_dtype) for col_dtype in data.dtypes[non_object]]

    # scipy's skew propagates NaN by default, hence the explicit dropna().
    skew_value = [skew(data[col].dropna()) for col in numeric_cols]

    df = pd.concat(
        [pd.Series(numeric_cols), pd.Series(col_types), pd.Series(skew_value)],
        axis=1)
    df.columns = ['var_name', 'col_type', 'skew_value']

    return df
mir_make_mfcc_sample.py 文件源码 项目:toho_mir_ml 作者: kodack64 项目源码 文件源码 阅读 105 收藏 0 点赞 0 评论 0
def mfccPostProcess(directory,fileCount):
    """Write per-coefficient summary statistics for every MFCC csv file.

    For each file index in ``range(fileCount)`` and each suffix in the
    module-level ``mfccList``, the matrix stored in
    ``directory + str(index) + suffix + ".csv"`` is loaded, its delta
    features are computed, and seven statistics (mean, variance, skewness,
    non-Fisher kurtosis, median, min, max) are taken along axis 0 for both
    the matrix and its deltas.  NaNs are replaced by 0 and the 14 columns are
    saved, flattened in column-major order, to the matching ``_stat.txt``.
    """
    for count in range(fileCount):
        print("{0}/{1}".format(count+1,fileCount))
        for mfccext in mfccList:
            stem = directory+str(count)+mfccext
            mfcc = np.loadtxt(stem+".csv",delimiter=",")
            dmfcc = librosa.feature.delta(mfcc)
            result = np.zeros((mfcc.shape[1],14))

            # Columns 0-6 describe the coefficients, 7-13 their deltas.
            for base, frames in ((0, mfcc), (7, dmfcc)):
                result[:,base] = np.mean(frames, axis=0)
                result[:,base+1] = np.var(frames, axis=0, dtype=np.float64)
                result[:,base+2] = stats.skew(frames, axis=0)
                result[:,base+3] = stats.kurtosis(frames, axis=0, fisher=False)
                result[:,base+4] = np.median(frames, axis=0)
                result[:,base+5] = np.min(frames, axis=0)
                result[:,base+6] = np.max(frames, axis=0)

            result[np.isnan(result)] = 0
            np.savetxt(stem+"_stat.txt",result.flatten("F"),delimiter=",")
skew.py 文件源码 项目:scikit-discovery 作者: MITHaystack 项目源码 文件源码 阅读 28 收藏 0 点赞 0 评论 0
def process(self, obj_data):
    '''
    Apply Skew analysis with results added to the data wrapper

    For every (label, frame) pair from the wrapper's iterator and every
    default column, missing values are dropped, the lowest and highest 2%
    of the sorted values are trimmed, and the skewness of the remainder is
    stored under results[label][column].

    @param obj_data: Data wrapper
    '''

    column_names = obj_data.getDefaultColumns()

    results = defaultdict(dict)
    for label, frame in obj_data.getIterator():
        for column in column_names:
            # Drop missing data first so the 2% trim counts real samples.
            data = frame[column].dropna().sort_values(ascending=True)
            n_samples = len(data)
            rem_num = int(round(n_samples * 0.02))
            # Use an explicit end position: the previous `[rem_num:-rem_num]`
            # collapsed to an empty slice whenever rem_num was 0 (fewer than
            # 25 samples), which made the skewness NaN.
            trimmed = data.iloc[rem_num:n_samples - rem_num]
            res = skew(trimmed)
            if isinstance(res, np.ma.masked_array):
                # The np.float alias was removed from NumPy; the builtin
                # float performs the same conversion.
                res = float(res.data)
            results[label][column] = res
            # NOTE(review): addResult runs once per column with the growing
            # results dict; confirm whether a single call after the loops
            # was intended.
            obj_data.addResult(self.str_description, results)
two_sigma_financial_modelling.py 文件源码 项目:PortfolioTimeSeriesAnalysis 作者: MizioAnd 项目源码 文件源码 阅读 24 收藏 0 点赞 0 评论 0
def skew_correction(df, numerical_features):
    """Log-transform, in place, the strongly right-skewed numeric columns.

    Skewness is computed per column of ``df[numerical_features]`` with
    missing values ignored; columns whose skewness exceeds 0.75 are
    overwritten in ``df`` with ``log1p`` of their values.  Returns None.
    """
    skewness = df[numerical_features].apply(lambda col: skew(col.dropna()))
    skewed_feats = skewness[skewness > 0.75].index
    log_values = np.log1p(np.asarray(df[skewed_feats], dtype=float))
    df.loc[:, tuple(skewed_feats)] = log_values
binnings.py 文件源码 项目:physt 作者: janpipek 项目源码 文件源码 阅读 32 收藏 0 点赞 0 评论 0
def ideal_bin_count(data, method="default"):
    """A theoretically ideal bin count.

    Parameters
    ----------
    data: array_like
        Data to work on. Only its ``size`` is used by every method except
        "doane", which also needs the values to compute the skewness.
    method: str
        Name of the method to apply, available values:
          - default (~sturges)
          - sqrt
          - sturges
          - doane
          - rice
        See https://en.wikipedia.org/wiki/Histogram for the description

    Returns
    -------
    int
        Number of bins, always >= 1

    Raises
    ------
    ValueError
        If ``method`` is not one of the names listed above (and the data
        are not empty).
    """
    n = data.size
    if n < 1:
        return 1
    if method == "default":
        # Small samples get a fixed count; larger ones follow Sturges.
        if n <= 32:
            return 7
        return ideal_bin_count(data, "sturges")
    if method == "sqrt":
        return int(np.ceil(np.sqrt(n)))
    if method == "sturges":
        return int(np.ceil(np.log2(n)) + 1)
    if method == "doane":
        if n < 3:
            return 1
        from scipy.stats import skew
        # Doane: sigma = sqrt(6 (n - 2) / ((n + 1) (n + 3))).  The whole
        # product (n + 1)(n + 3) is the denominator; the float literal keeps
        # the division exact on interpreters with integer division.
        sigma = np.sqrt(6.0 * (n - 2) / ((n + 1) * (n + 3)))
        return int(np.ceil(1 + np.log2(n) + np.log2(1 + np.abs(skew(data)) / sigma)))
    if method == "rice":
        # 1.0 / 3 so the cube root is not truncated to n ** 0.
        return int(np.ceil(2 * np.power(n, 1.0 / 3)))
    raise ValueError("Unknown bin count method: {0!r}".format(method))
house_prices.py 文件源码 项目:HousePrices 作者: MizioAnd 项目源码 文件源码 阅读 34 收藏 0 点赞 0 评论 0
def skew_correction(df, numerical_features):
    """Apply ``log1p`` in place to the columns of ``df`` that are skewed.

    A column of ``numerical_features`` counts as skewed when the skewness of
    its non-missing values is greater than 0.75.  Nothing is returned; ``df``
    itself is updated.
    """
    def column_skewness(column):
        return skew(column.dropna())

    per_column = df[numerical_features].apply(column_skewness)
    above_threshold = per_column > 0.75
    skewed_feats = per_column[above_threshold].index

    raw_values = np.asarray(df[skewed_feats], dtype=float)
    df.loc[:, tuple(skewed_feats)] = np.log1p(raw_values)
        # df[skewed_feats] = np.log1p(np.asarray(df[skewed_feats], dtype=float))
test_panel4d.py 文件源码 项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda 作者: SignalMedia 项目源码 文件源码 阅读 33 收藏 0 点赞 0 评论 0
def test_skew(self):
    # scipy provides the reference implementation; skip without it.
    try:
        from scipy.stats import skew
    except ImportError:
        raise nose.SkipTest("no scipy.stats.skew")

    def expected_skew(values):
        # NaN is the reference result below three observations.
        if len(values) >= 3:
            return skew(values, bias=False)
        return np.nan

    self._check_stat_op('skew', expected_skew)

    # def test_mad(self):
    #     f = lambda x: np.abs(x - x.mean()).mean()
    #     self._check_stat_op('mad', f)
test_analytics.py 文件源码 项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda 作者: SignalMedia 项目源码 文件源码 阅读 30 收藏 0 点赞 0 评论 0
def test_skew(self):
    # Check the 'skew' reduction against scipy's bias-corrected skewness.
    tm._skip_if_no_scipy()
    from scipy.stats import skew

    def reference(values):
        # With fewer than three values the expected result is NaN.
        return np.nan if len(values) < 3 else skew(values, bias=False)

    self._check_stat_op('skew', reference)
test_analytics.py 文件源码 项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda 作者: SignalMedia 项目源码 文件源码 阅读 34 收藏 0 点赞 0 评论 0
def test_stats_mixed_type(self):
        # don't blow up
        self.mixed_frame.std(1)
        self.mixed_frame.var(1)
        self.mixed_frame.mean(1)
        self.mixed_frame.skew(1)
test_window.py 文件源码 项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda 作者: SignalMedia 项目源码 文件源码 阅读 27 收藏 0 点赞 0 评论 0
def test_how_compat(self):
        # In prior versions `how` was allowed in the resample step; now that
        # it is deprecated it has to be handled in the actual aggregation
        # functions.  This test checks that the top-level function form and
        # the method form give equal results for every combination below.
        s = pd.Series(
            np.random.randn(20),
            index=pd.date_range('1/1/2000', periods=20, freq='12H'))

        for how in ['min', 'max', 'median']:
            for op in ['mean', 'sum', 'std', 'var', 'kurt', 'skew']:
                for t in ['rolling', 'expanding']:

                    # Everything inside this context is expected to emit a
                    # FutureWarning.
                    with tm.assert_produces_warning(FutureWarning,
                                                    check_stacklevel=False):

                        # Top-level function named e.g. "rolling_mean".
                        # NOTE(review): getattr without a default raises
                        # AttributeError for a missing name instead of
                        # returning None, so the check below never skips.
                        dfunc = getattr(pd, "{0}_{1}".format(t, op))
                        if dfunc is None:
                            continue

                        # Only the rolling variant takes a window size.
                        if t == 'rolling':
                            kwargs = {'window': 5}
                        else:
                            kwargs = {}
                        result = dfunc(s, freq='D', how=how, **kwargs)

                        # Method form: s.rolling(...) / s.expanding(...)
                        # followed by the aggregation called with `how`.
                        expected = getattr(
                            getattr(s, t)(freq='D', **kwargs), op)(how=how)
                        assert_series_equal(result, expected)
test_window.py 文件源码 项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda 作者: SignalMedia 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def test_rolling_skew(self):
    # scipy's bias-corrected skew is the per-window reference.
    try:
        from scipy.stats import skew
    except ImportError:
        raise nose.SkipTest('no scipy')

    def unbiased_skew(window):
        return skew(window, bias=False)

    self._check_moment_func(mom.rolling_skew, unbiased_skew, name='skew')
test_nanops.py 文件源码 项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda 作者: SignalMedia 项目源码 文件源码 阅读 27 收藏 0 点赞 0 评论 0
def test_nanskew(self):
    # Compare nanops.nanskew with scipy's skew wrapped by _skew_kurt_wrap,
    # restricted to input kinds for which skewness is checked.
    tm.skip_if_no_package('scipy.stats')
    tm._skip_if_scipy_0_17()
    from scipy.stats import skew

    reference = partial(self._skew_kurt_wrap, func=skew)
    excluded_kinds = dict(allow_complex=False, allow_str=False,
                          allow_date=False, allow_tdelta=False)
    self.check_funs(nanops.nanskew, reference, **excluded_kinds)
test_nanops.py 文件源码 项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda 作者: SignalMedia 项目源码 文件源码 阅读 27 收藏 0 点赞 0 评论 0
def setUp(self):
    # Fixture: 200 samples of sin(x) on [0, 1] together with their skewness
    # (reference value computed with scipy.stats.skew).
    grid = np.linspace(0, 1, 200)
    self.samples = np.sin(grid)
    self.actual_skew = -0.1875895205961754
test_nanops.py 文件源码 项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda 作者: SignalMedia 项目源码 文件源码 阅读 28 收藏 0 点赞 0 评论 0
def test_constant_series(self):
    # xref GH 11974: a constant array must have a skewness of exactly zero.
    for val in (3075.2, 3075.3, 3075.5):
        constant = np.ones(300) * val
        self.assertEqual(nanops.nanskew(constant), 0.0)
test_nanops.py 文件源码 项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda 作者: SignalMedia 项目源码 文件源码 阅读 32 收藏 0 点赞 0 评论 0
def test_ground_truth(self):
    # nanskew on the fixture samples must reproduce the stored reference.
    computed = nanops.nanskew(self.samples)
    self.assertAlmostEqual(computed, self.actual_skew)
test_nanops.py 文件源码 项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda 作者: SignalMedia 项目源码 文件源码 阅读 43 收藏 0 点赞 0 评论 0
def test_nans(self):
    # With skipna=False a single NaN in the input makes the result NaN.
    with_nan = np.hstack([self.samples, np.nan])
    result = nanops.nanskew(with_nan, skipna=False)
    self.assertTrue(np.isnan(result))
test_nanops.py 文件源码 项目:PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda 作者: SignalMedia 项目源码 文件源码 阅读 29 收藏 0 点赞 0 评论 0
def test_nans_skipna(self):
    # With skipna=True the appended NaN is ignored and the reference
    # skewness of the fixture samples is recovered.
    with_nan = np.hstack([self.samples, np.nan])
    result = nanops.nanskew(with_nan, skipna=True)
    tm.assert_almost_equal(result, self.actual_skew)
features.py 文件源码 项目:stegasawus 作者: rokkuran 项目源码 文件源码 阅读 35 收藏 0 点赞 0 评论 0
def statistical_metrics(x):
    """
    Return mean, standard deviation, skewness and kurtosis of ``x``.

    The input array is flattened first, so the statistics describe all of its
    elements together.  The result is a dict with the keys 'mean', 'stdev',
    'skew' and 'kurtosis'.
    """
    flat = x.flatten()
    return {
        'mean': np.mean(flat),
        'stdev': np.std(flat),
        'skew': stats.skew(flat),
        'kurtosis': stats.kurtosis(flat),
    }
test_gen_usr_distrib.py 文件源码 项目:bmlingam 作者: taku-y 项目源码 文件源码 阅读 25 收藏 0 点赞 0 评论 0
def test_gen_usr_distrib(n_samples=100000, verbose=False):
    """Check the sample moments of the generated 'laplace' and 'exp' draws.

    Both draws come from the same seeded generator, in this order, and every
    comparison uses an absolute tolerance of 5e-2.
    """
    tolerance = 5e-2
    rng = np.random.RandomState(0)

    laplace_draws = _gen_usr_distrib(n_samples, ['laplace'], rng)
    assert_allclose(np.mean(laplace_draws), 0, atol=tolerance)
    assert_allclose(np.std(laplace_draws), 1, atol=tolerance)
    assert_allclose(skew(laplace_draws)[0], 0, atol=tolerance)
    assert_allclose(kurtosis(laplace_draws)[0], 3, atol=tolerance)

    exp_draws = _gen_usr_distrib(n_samples, ['exp'], rng)
    assert_allclose(np.std(exp_draws), 1, atol=tolerance)
w2v_distance.py 文件源码 项目:Quora-Kaggle 作者: PPshrimpGo 项目源码 文件源码 阅读 26 收藏 0 点赞 0 评论 0
def get_features(df_features):
    """Add similarity and distribution features for each question pair.

    New ``z_*`` columns (document distance, Levenshtein ratio, tf-idf cosine
    similarity, word2vec distances, skewness and kurtosis of the question
    vectors) plus ``q1_unique`` / ``q2_unique`` are added to ``df_features``
    in place; the temporary ``question1_w2v`` / ``question2_w2v`` columns are
    removed again.  Missing values are then replaced by 0.0 in place and the
    same frame is returned.

    Progress labels followed by a timestamp are printed along the way.
    """
    def _log_step(message):
        # Print a progress label and the current wall-clock time.  Using the
        # call form of print keeps this valid on Python 2 and Python 3.
        print(message)
        print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

    _log_step('use w2v to document presentation')
    df_features['z_document_dis'] = df_features.apply(lambda x: getDiff_averge_tfidf(x['question1'], x['question2']), axis = 1)

    _log_step('nones')
    df_features['q1_unique'] = df_features.apply(lambda x: getdiffwords(x['question1'], x['question2']), axis = 1)
    df_features['q2_unique'] = df_features.apply(lambda x: getdiffwords(x['question2'], x['question1']), axis = 1)
    df_features['question1_w2v'] = df_features.question1.map(lambda x: get_vector_tfidf(" ".join(x)))
    df_features['question2_w2v'] = df_features.question2.map(lambda x: get_vector_tfidf(" ".join(x)))

    _log_step('z_dist')
    df_features['z_dist'] = df_features.apply(lambda x:Levenshtein.ratio(x['question1'], x['question2']), axis=1)

    _log_step('z_tfidf_cos_sim')
    df_features['z_tfidf_cos_sim'] = df_features.apply(lambda x: cos_sim(x['question1'], x['question2']), axis=1)

    _log_step('z_w2v_nones')
    df_features['z_w2v_unique'] = df_features.apply(lambda x: w2v_cos_sim_tfidf(x['q1_unique'], x['q2_unique']), axis=1)
    df_features['z_w2v_dis_e'] = df_features.apply(lambda x: spatial.distance.euclidean(x['question1_w2v'], x['question2_w2v']), axis=1)
    df_features['z_w2v_dis_mink'] = df_features.apply(lambda x: spatial.distance.minkowski(x['question1_w2v'], x['question2_w2v'],3), axis=1)
    df_features['z_w2v_dis_cityblock'] = df_features.apply(lambda x: spatial.distance.cityblock(x['question1_w2v'], x['question2_w2v']), axis=1)
    df_features['z_w2v_dis_canberra'] = df_features.apply(lambda x: spatial.distance.canberra(x['question1_w2v'], x['question2_w2v']), axis=1)
    df_features['z_q1_skew'] = df_features.question1_w2v.map(lambda x:skew(x))
    df_features['z_q2_skew'] = df_features.question2_w2v.map(lambda x:skew(x))
    df_features['z_q1_kur'] = df_features.question1_w2v.map(lambda x:kurtosis(x))
    df_features['z_q2_kur'] = df_features.question2_w2v.map(lambda x:kurtosis(x))

    # The raw vectors were only needed to derive the features above.
    del df_features['question1_w2v']
    del df_features['question2_w2v']

    # The final timestamp is taken now rather than reusing an earlier one.
    _log_step('all done')

    # fillna returns a new frame unless inplace=True; the previous bare call
    # discarded its result, so NaNs were never actually replaced.
    df_features.fillna(0.0, inplace=True)
    return df_features


问题


面经


文章

微信
公众号

扫码关注公众号