def calculate_aggregate(values):
    agg_measures = {
        'avg': np.mean(values),
        'std': np.std(values),
        'var': np.var(values),
        'med': np.median(values),
        '10p': np.percentile(values, 10),
        '25p': np.percentile(values, 25),
        '50p': np.percentile(values, 50),
        '75p': np.percentile(values, 75),
        '90p': np.percentile(values, 90),
        'iqr': np.percentile(values, 75) - np.percentile(values, 25),
        'iqm': interquartile_range_mean(values),
        'mad': mean_absolute_deviation(values),
        'cov': 1.0 * np.mean(values) / np.std(values),  # note: this is mean/std, the inverse of the usual coefficient of variation (std/mean)
        'gin': gini_coefficient(values),
        'skw': stats.skew(values),
        'kur': stats.kurtosis(values),
        'sum': np.sum(values)
    }
    return agg_measures
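The snippet above relies on three helpers that are not shown on this page (interquartile_range_mean, mean_absolute_deviation, gini_coefficient). A minimal sketch of plausible implementations, assuming 1-D numeric input; these are illustrations, not the project's actual code:

import numpy as np

def interquartile_range_mean(values):
    # mean of the values lying between the 25th and 75th percentiles
    q1, q3 = np.percentile(values, [25, 75])
    values = np.asarray(values, dtype=float)
    return values[(values >= q1) & (values <= q3)].mean()

def mean_absolute_deviation(values):
    # average absolute distance from the mean
    values = np.asarray(values, dtype=float)
    return np.abs(values - values.mean()).mean()

def gini_coefficient(values):
    # Gini coefficient via the sorted-index formula; assumes non-negative values
    v = np.sort(np.asarray(values, dtype=float))
    n = v.size
    index = np.arange(1, n + 1)
    return (2.0 * np.sum(index * v) - (n + 1) * np.sum(v)) / (n * np.sum(v))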
Python kurtosis() examples (source code)
Source file: calculate_aggregate_statistics.py
Project: tbp-next-basket
Author: GiulioRossetti
def calculate_aggregate(values):
    agg_measures = {
        'avg': np.mean(values),
        'std': np.std(values),
        'var': np.var(values),
        'med': np.median(values),
        '10p': np.percentile(values, 10),
        '25p': np.percentile(values, 25),
        '50p': np.percentile(values, 50),
        '75p': np.percentile(values, 75),
        '90p': np.percentile(values, 90),
        'iqr': np.percentile(values, 75) - np.percentile(values, 25),
        'iqm': interquartile_range_mean(values),
        'mad': mean_absolute_deviation(values),
        'cov': 1.0 * np.mean(values) / np.std(values),
        'gin': gini_coefficient(values),
        'skw': stats.skew(values),
        'kur': stats.kurtosis(values)
    }
    return agg_measures
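A quick usage sketch, assuming numpy and scipy.stats are imported as in the project file and the helper functions sketched earlier are in scope:

import numpy as np
from scipy import stats

values = np.array([1.0, 2.0, 2.5, 3.0, 4.5, 5.0, 8.0])
agg = calculate_aggregate(values)
print(agg['avg'], agg['med'], agg['kur'])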
Source file: test_analytics.py
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
Author: SignalMedia
def test_kurt(self):
    tm._skip_if_no_scipy()
    from scipy.stats import kurtosis
    alt = lambda x: kurtosis(x, bias=False)
    self._check_stat_op('kurt', alt)

    index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]],
                       labels=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2],
                               [0, 1, 0, 1, 0, 1]])
    s = Series(np.random.randn(6), index=index)
    self.assertAlmostEqual(s.kurt(), s.kurt(level=0)['bar'])

    # test corner cases, kurt() returns NaN unless there's at least 4
    # values
    min_N = 4
    for i in range(1, min_N + 1):
        s = Series(np.ones(i))
        df = DataFrame(np.ones((i, i)))
        if i < min_N:
            self.assertTrue(np.isnan(s.kurt()))
            self.assertTrue(np.isnan(df.kurt()).all())
        else:
            self.assertEqual(0, s.kurt())
            self.assertTrue((df.kurt() == 0).all())
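This test pins pandas' Series.kurt() to SciPy's bias-corrected excess kurtosis. The same relationship as a standalone check, assuming recent pandas and SciPy:

import numpy as np
import pandas as pd
from scipy.stats import kurtosis

x = np.random.default_rng(0).normal(size=1000)
# pandas uses the adjusted (bias-corrected) Fisher kurtosis, matching bias=False
assert np.isclose(pd.Series(x).kurt(), kurtosis(x, bias=False))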
Source file: test_analytics.py
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
Author: SignalMedia
def test_kurt(self):
    tm._skip_if_no_scipy()
    from scipy.stats import kurtosis

    def alt(x):
        if len(x) < 4:
            return np.nan
        return kurtosis(x, bias=False)

    self._check_stat_op('kurt', alt)

    index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]],
                       labels=[[0, 0, 0, 0, 0, 0],
                               [0, 1, 2, 0, 1, 2],
                               [0, 1, 0, 1, 0, 1]])
    df = DataFrame(np.random.randn(6, 3), index=index)
    kurt = df.kurt()
    kurt2 = df.kurt(level=0).xs('bar')
    assert_series_equal(kurt, kurt2, check_names=False)
    self.assertTrue(kurt.name is None)
    self.assertEqual(kurt2.name, 'bar')
def plot_hist(item, figure_id=1):
    pt.figure(figure_id)
    kurtosis = -np.ones(8)
    for i in range(item.shape[1]):
        pt.subplot(241 + i)  # subplot numbering is 1-based; the original 240+i fails for i=0
        tmp = item[item[:, i] != -1, i]
        tmp = tmp + np.random.rand(len(tmp)) - 0.5  # jitter the discrete scores
        pt.hist(tmp, bins=6, density=True, range=(0.9, 6.1), alpha=0.8, color=colorc[i])  # 'normed' was removed in matplotlib 3.x
        pt.title(name[i])
        density = kde.gaussian_kde(tmp)
        xgrid = np.linspace(0, 6, 100)
        pt.plot(xgrid, density(xgrid), 'r-')
        avg = np.mean(tmp)
        sd = np.std(tmp)
        pt.plot(xgrid, normpdf(xgrid, avg, sd))  # normpdf, colorc and name come from elsewhere in the source file
        kurtosis[i] = sps.kurtosis(item[item[:, i] != -1, i])  # moved inside the loop so every column is measured
    pt.show()
    return kurtosis
def _statistics(self):
    data = self.tr.data
    t = np.arange(0, self.delta * self.npts, self.delta)
    m = len(data)
    Nsta = int(self.t_win * self.sampling_rate)
    # compute the short time average (STA)
    kt = np.zeros(m, dtype='float64')
    pad_kt = np.zeros(Nsta)
    # Tricky: Construct a big window of length len(a)-nsta. Now move this
    # window nsta points, i.e. the window "sees" every point in a at least
    # once.
    # Changed xrange to range as it is compatible in both python 2 & 3
    for i in range(m):  # window size to smooth over
        # for i < Nsta the slice wraps around; those entries are zeroed out below
        kt[i] = abs(kurtosis(data[i-Nsta:i]))
    kt[0:Nsta] = 0
    return kt
def mfccPostProcess(directory, fileCount):
    for count in range(fileCount):
        print("{0}/{1}".format(count + 1, fileCount))
        for mfccext in mfccList:
            mfcc = np.loadtxt(directory + str(count) + mfccext + ".csv", delimiter=",")
            dmfcc = librosa.feature.delta(mfcc)
            # columns 0-6: statistics of the MFCCs, columns 7-13: of their deltas
            result = np.zeros((mfcc.shape[1], 14))
            result[:, 0] = np.mean(mfcc, axis=0)
            result[:, 1] = np.var(mfcc, axis=0, dtype=np.float64)
            result[:, 2] = stats.skew(mfcc, axis=0)
            result[:, 3] = stats.kurtosis(mfcc, axis=0, fisher=False)
            result[:, 4] = np.median(mfcc, axis=0)
            result[:, 5] = np.min(mfcc, axis=0)
            result[:, 6] = np.max(mfcc, axis=0)
            result[:, 7] = np.mean(dmfcc, axis=0)
            result[:, 8] = np.var(dmfcc, axis=0, dtype=np.float64)
            result[:, 9] = stats.skew(dmfcc, axis=0)
            result[:, 10] = stats.kurtosis(dmfcc, axis=0, fisher=False)
            result[:, 11] = np.median(dmfcc, axis=0)
            result[:, 12] = np.min(dmfcc, axis=0)
            result[:, 13] = np.max(dmfcc, axis=0)
            result[np.where(np.isnan(result))] = 0
            np.savetxt(directory + str(count) + mfccext + "_stat.txt", result.flatten("F"), delimiter=",")
def feat_eeg(signals):
    """
    calculate the relative power as defined by Leangkvist (2012),
    assuming the signal is recorded at 100 Hz
    """
    if signals.ndim == 1:
        signals = np.expand_dims(signals, 0)
    sfreq = use_sfreq
    nsamp = float(signals.shape[1])
    feats = np.zeros((signals.shape[0], 9), dtype='float32')
    # 5 features for the frequency bands
    w = (fft(signals, axis=1)).real
    delta = np.sum(np.abs(w[:, np.arange(0.5*nsamp/sfreq, 4*nsamp/sfreq, dtype=int)]), axis=1)
    theta = np.sum(np.abs(w[:, np.arange(4*nsamp/sfreq, 8*nsamp/sfreq, dtype=int)]), axis=1)
    alpha = np.sum(np.abs(w[:, np.arange(8*nsamp/sfreq, 13*nsamp/sfreq, dtype=int)]), axis=1)
    beta = np.sum(np.abs(w[:, np.arange(13*nsamp/sfreq, 20*nsamp/sfreq, dtype=int)]), axis=1)
    gamma = np.sum(np.abs(w[:, np.arange(20*nsamp/sfreq, 50*nsamp/sfreq, dtype=int)]), axis=1)  # only until 50, because hz=100
    spindle = np.sum(np.abs(w[:, np.arange(12*nsamp/sfreq, 14*nsamp/sfreq, dtype=int)]), axis=1)
    sum_abs_pow = delta + theta + alpha + beta + gamma + spindle
    feats[:, 0] = delta / sum_abs_pow
    feats[:, 1] = theta / sum_abs_pow
    feats[:, 2] = alpha / sum_abs_pow
    feats[:, 3] = beta / sum_abs_pow
    feats[:, 4] = gamma / sum_abs_pow
    feats[:, 5] = spindle / sum_abs_pow
    feats[:, 6] = np.log10(stats.kurtosis(signals, fisher=False, axis=1))  # kurtosis
    feats[:, 7] = np.log10(-np.sum([(x/nsamp)*(np.log(x/nsamp)) for x in np.apply_along_axis(lambda x: np.histogram(x, bins=8)[0], 1, signals)], axis=1))  # histogram entropy
    #feats[:,7] = np.polynomial.polynomial.polyfit(np.log(f[np.arange(0.5*nsamp/sfreq,50*nsamp/sfreq, dtype=int)]), np.log(w[0,np.arange(0.5*nsamp/sfreq,50*nsamp/sfreq, dtype=int)]),1)
    feats[:, 8] = np.dot(np.array([3.5, 4, 5, 7, 30]), feats[:, 0:5].T) / (sfreq/2 - 0.5)  # spectral mean
    if np.any(np.isnan(feats)):  # fixed: `feats == np.nan` is always False
        print('NaN detected')
    return np.nan_to_num(feats)
def feat_wavelet(signals):
    """
    calculate the relative power as defined by Leangkvist (2012),
    assuming the signal is recorded at 100 Hz
    """
    if signals.ndim == 1:
        signals = np.expand_dims(signals, 0)
    sfreq = use_sfreq
    nsamp = float(signals.shape[1])
    feats = np.zeros((signals.shape[0], 8), dtype='float32')
    # 5 features for the frequency bands
    w = (fft(signals, axis=1)).real
    delta = np.sum(np.abs(w[:, np.arange(0.5*nsamp/sfreq, 4*nsamp/sfreq, dtype=int)]), axis=1)
    theta = np.sum(np.abs(w[:, np.arange(4*nsamp/sfreq, 8*nsamp/sfreq, dtype=int)]), axis=1)
    alpha = np.sum(np.abs(w[:, np.arange(8*nsamp/sfreq, 13*nsamp/sfreq, dtype=int)]), axis=1)
    beta = np.sum(np.abs(w[:, np.arange(13*nsamp/sfreq, 20*nsamp/sfreq, dtype=int)]), axis=1)
    gamma = np.sum(np.abs(w[:, np.arange(20*nsamp/sfreq, 50*nsamp/sfreq, dtype=int)]), axis=1)  # only until 50, because hz=100
    sum_abs_pow = delta + theta + alpha + beta + gamma
    feats[:, 0] = delta / sum_abs_pow
    feats[:, 1] = theta / sum_abs_pow
    feats[:, 2] = alpha / sum_abs_pow
    feats[:, 3] = beta / sum_abs_pow
    feats[:, 4] = gamma / sum_abs_pow
    feats[:, 5] = np.log10(stats.kurtosis(signals, fisher=False, axis=1))  # kurtosis
    feats[:, 6] = np.log10(-np.sum([(x/nsamp)*(np.log(x/nsamp)) for x in np.apply_along_axis(lambda x: np.histogram(x, bins=8)[0], 1, signals)], axis=1))  # histogram entropy
    #feats[:,7] = np.polynomial.polynomial.polyfit(np.log(f[np.arange(0.5*nsamp/sfreq,50*nsamp/sfreq, dtype=int)]), np.log(w[0,np.arange(0.5*nsamp/sfreq,50*nsamp/sfreq, dtype=int)]),1)
    feats[:, 7] = np.dot(np.array([3.5, 4, 5, 7, 30]), feats[:, 0:5].T) / (sfreq/2 - 0.5)  # spectral mean
    if np.any(np.isnan(feats)):  # fixed: `feats == np.nan` is always False
        print('NaN detected')
    return np.nan_to_num(feats)
def feat_eog(signals):
    """
    calculate the EOG features
    :param signals: 1D or 2D signals
    """
    if signals.ndim == 1:
        signals = np.expand_dims(signals, 0)
    sfreq = use_sfreq
    nsamp = float(signals.shape[1])
    w = (fft(signals, axis=1)).real
    feats = np.zeros((signals.shape[0], 15), dtype='float32')
    delta = np.sum(np.abs(w[:, np.arange(0.5*nsamp/sfreq, 4*nsamp/sfreq, dtype=int)]), axis=1)
    theta = np.sum(np.abs(w[:, np.arange(4*nsamp/sfreq, 8*nsamp/sfreq, dtype=int)]), axis=1)
    alpha = np.sum(np.abs(w[:, np.arange(8*nsamp/sfreq, 13*nsamp/sfreq, dtype=int)]), axis=1)
    beta = np.sum(np.abs(w[:, np.arange(13*nsamp/sfreq, 20*nsamp/sfreq, dtype=int)]), axis=1)
    gamma = np.sum(np.abs(w[:, np.arange(20*nsamp/sfreq, 50*nsamp/sfreq, dtype=int)]), axis=1)  # only until 50, because hz=100
    sum_abs_pow = delta + theta + alpha + beta + gamma
    feats[:, 0] = delta / sum_abs_pow
    feats[:, 1] = theta / sum_abs_pow
    feats[:, 2] = alpha / sum_abs_pow
    feats[:, 3] = beta / sum_abs_pow
    feats[:, 4] = gamma / sum_abs_pow
    feats[:, 5] = np.dot(np.array([3.5, 4, 5, 7, 30]), feats[:, 0:5].T) / (sfreq/2 - 0.5)  # smean
    feats[:, 6] = np.sqrt(np.max(signals, axis=1))  # PAV
    feats[:, 7] = np.sqrt(np.abs(np.min(signals, axis=1)))  # VAV
    feats[:, 8] = np.argmax(signals, axis=1)/nsamp  # PAP
    feats[:, 9] = np.argmin(signals, axis=1)/nsamp  # VAP
    feats[:, 10] = np.sqrt(np.sum(np.abs(signals), axis=1) / np.mean(np.sum(np.abs(signals), axis=1)))  # AUC
    feats[:, 11] = np.sum(((np.roll(np.sign(signals), 1, axis=1) - np.sign(signals)) != 0).astype(int), axis=1)/nsamp  # TVC
    feats[:, 12] = np.log10(np.std(signals, axis=1))  # STD/VAR
    feats[:, 13] = np.log10(stats.kurtosis(signals, fisher=False, axis=1))  # kurtosis
    feats[:, 14] = np.log10(-np.sum([(x/nsamp)*((np.log((x+np.spacing(1))/nsamp))) for x in np.apply_along_axis(lambda x: np.histogram(x, bins=8)[0], 1, signals)], axis=1))  # histogram entropy
    if np.any(np.isnan(feats)):  # fixed: `feats == np.nan` is always False
        print('NaN detected')
    return np.nan_to_num(feats)
def feat_emg(signals):
    """
    calculate the EMG median as defined by Leangkvist (2012)
    """
    if signals.ndim == 1:
        signals = np.expand_dims(signals, 0)
    sfreq = use_sfreq
    nsamp = float(signals.shape[1])
    w = (fft(signals, axis=1)).real
    feats = np.zeros((signals.shape[0], 13), dtype='float32')
    delta = np.sum(np.abs(w[:, np.arange(0.5*nsamp/sfreq, 4*nsamp/sfreq, dtype=int)]), axis=1)
    theta = np.sum(np.abs(w[:, np.arange(4*nsamp/sfreq, 8*nsamp/sfreq, dtype=int)]), axis=1)
    alpha = np.sum(np.abs(w[:, np.arange(8*nsamp/sfreq, 13*nsamp/sfreq, dtype=int)]), axis=1)
    beta = np.sum(np.abs(w[:, np.arange(13*nsamp/sfreq, 20*nsamp/sfreq, dtype=int)]), axis=1)
    gamma = np.sum(np.abs(w[:, np.arange(20*nsamp/sfreq, 50*nsamp/sfreq, dtype=int)]), axis=1)  # only until 50, because hz=100
    sum_abs_pow = delta + theta + alpha + beta + gamma
    feats[:, 0] = delta / sum_abs_pow
    feats[:, 1] = theta / sum_abs_pow
    feats[:, 2] = alpha / sum_abs_pow
    feats[:, 3] = beta / sum_abs_pow
    feats[:, 4] = gamma / sum_abs_pow
    feats[:, 5] = np.dot(np.array([3.5, 4, 5, 7, 30]), feats[:, 0:5].T) / (sfreq/2 - 0.5)  # smean
    emg = np.sum(np.abs(w[:, np.arange(12.5*nsamp/sfreq, 32*nsamp/sfreq, dtype=int)]), axis=1)
    feats[:, 6] = emg / np.sum(np.abs(w[:, np.arange(8*nsamp/sfreq, 32*nsamp/sfreq, dtype=int)]), axis=1)  # ratio of high freq to total motor
    feats[:, 7] = np.median(np.abs(w[:, np.arange(8*nsamp/sfreq, 32*nsamp/sfreq, dtype=int)]), axis=1)  # median freq
    feats[:, 8] = np.mean(np.abs(w[:, np.arange(8*nsamp/sfreq, 32*nsamp/sfreq, dtype=int)]), axis=1)  # mean freq
    feats[:, 9] = np.std(signals, axis=1)  # std
    feats[:, 10] = np.mean(signals, axis=1)
    feats[:, 11] = np.log10(stats.kurtosis(signals, fisher=False, axis=1))
    feats[:, 12] = np.log10(-np.sum([(x/nsamp)*((np.log((x+np.spacing(1))/nsamp))) for x in np.apply_along_axis(lambda x: np.histogram(x, bins=8)[0], 1, signals)], axis=1))  # histogram entropy
    if np.any(np.isnan(feats)):  # fixed: `feats == np.nan` is always False
        print('NaN detected')
    return np.nan_to_num(feats)
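The four feat_* functions above share the same relative band-power core. A minimal standalone sketch of that computation, assuming a 100 Hz sampling rate and using rfft so each bin maps directly to a non-negative frequency:

import numpy as np

def relative_band_powers(signal, sfreq=100.0,
                         bands=((0.5, 4), (4, 8), (8, 13), (13, 20), (20, 50))):
    # magnitude spectrum of a 1-D signal
    spectrum = np.abs(np.fft.rfft(signal))
    freqs = np.fft.rfftfreq(len(signal), d=1.0 / sfreq)
    # sum of magnitudes inside each band, normalized to the total
    powers = np.array([spectrum[(freqs >= lo) & (freqs < hi)].sum()
                       for lo, hi in bands])
    return powers / powers.sum()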
def __init__(self, s, lags=0, kurtosis='adapt',
             learningRate=1.5, tolerance=1.0e-6, maxIter=10000,
             callback=None, verbose=False, *args, **kwargs):
    STrans.__init__(self, s, lags=lags, *args, **kwargs)
    self.train(s, kurtosis=kurtosis,
               learningRate=learningRate,
               tolerance=tolerance, maxIter=maxIter,
               callback=callback, verbose=verbose)
def demoICA():
    t = np.linspace(0.0, 30*np.pi, 1000)
    s1 = spsig.sawtooth(t)
    s2 = np.cos(5.0*t)
    s3 = np.random.uniform(-1.0, 1.0, size=t.size)
    s = np.vstack((s1, s2, s3)).T
    m = np.random.random((3, 3))
    m /= m.sum(axis=0)
    sMixed = s.dot(m)
    icaFilt = ICA(sMixed, kurtosis='sub', verbose=True)
    fig = plt.figure()
    axOrig = fig.add_subplot(4, 1, 1)
    axOrig.plot(s+util.colsep(s))
    axOrig.set_title('Unmixed Signal')
    axOrig.autoscale(tight=True)
    axMixed = fig.add_subplot(4, 1, 2)
    axMixed.plot(sMixed+util.colsep(sMixed))
    axMixed.set_title('Mixed Signal (random transform)')
    axMixed.autoscale(tight=True)
    axUnmixed = fig.add_subplot(4, 1, 3)
    icaFilt.plotTransform(sMixed, ax=axUnmixed)
    axUnmixed.set_title('ICA Components')
    axUnmixed.autoscale(tight=True)
    axCleaned = fig.add_subplot(4, 1, 4)
    icaFilt.plotFilter(sMixed, comp=(0, 1,), ax=axCleaned)
    axCleaned.set_title('Cleaned Signal (First two components kept)')
    axCleaned.autoscale(tight=True)
    fig.tight_layout()
Source file: test_window.py
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
Author: SignalMedia
def test_rolling_kurt(self):
    try:
        from scipy.stats import kurtosis
    except ImportError:
        raise nose.SkipTest('no scipy')
    self._check_moment_func(mom.rolling_kurt,
                            lambda x: kurtosis(x, bias=False), name='kurt')
Source file: test_nanops.py
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
Author: SignalMedia
def test_nankurt(self):
    tm.skip_if_no_package('scipy.stats')
    tm._skip_if_scipy_0_17()
    from scipy.stats import kurtosis
    func1 = partial(kurtosis, fisher=True)
    func = partial(self._skew_kurt_wrap, func=func1)
    self.check_funs(nanops.nankurt, func, allow_complex=False,
                    allow_str=False, allow_date=False, allow_tdelta=False)
Source file: test_nanops.py
Project: PyDataLondon29-EmbarrassinglyParallelDAWithAWSLambda
Author: SignalMedia
def setUp(self):
    # Test data + kurtosis value (computed with scipy.stats.kurtosis)
    self.samples = np.sin(np.linspace(0, 1, 200))
    self.actual_kurt = -1.2058303433799713
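A quick standalone check of that fixture value, assuming SciPy's default Fisher (excess) kurtosis:

import numpy as np
from scipy.stats import kurtosis

samples = np.sin(np.linspace(0, 1, 200))
print(kurtosis(samples))  # approximately -1.2058303433799713, the actual_kurt above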
def statistical_metrics(x):
    """
    Calculates statistical metrics on input array (mean, std, skew, kurtosis).
    """
    metrics = {
        'mean': np.mean,
        'stdev': np.std,
        'skew': stats.skew,
        'kurtosis': stats.kurtosis
    }
    return {k: fn(x.flatten()) for k, fn in metrics.items()}
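A usage sketch on hypothetical data, assuming the function above with numpy and scipy.stats in scope:

import numpy as np
from scipy import stats

x = np.random.default_rng(7).normal(size=(16, 16))
print(statistical_metrics(x))  # {'mean': ..., 'stdev': ..., 'skew': ..., 'kurtosis': ...}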
def test_gen_usr_distrib(n_samples=100000, verbose=False):
    rng = np.random.RandomState(0)
    xs = _gen_usr_distrib(n_samples, ['laplace'], rng)
    assert_allclose(np.mean(xs), 0, atol=5e-2)
    assert_allclose(np.std(xs), 1, atol=5e-2)
    assert_allclose(skew(xs)[0], 0, atol=5e-2)
    assert_allclose(kurtosis(xs)[0], 3, atol=5e-2)  # excess kurtosis of a Laplace distribution is 3
    xs = _gen_usr_distrib(n_samples, ['exp'], rng)
    assert_allclose(np.std(xs), 1, atol=5e-2)
def get_features(df_features):
    print('use w2v to document presentation')
    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    df_features['z_document_dis'] = df_features.apply(lambda x: getDiff_averge_tfidf(x['question1'], x['question2']), axis=1)
    print('nones')
    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    df_features['q1_unique'] = df_features.apply(lambda x: getdiffwords(x['question1'], x['question2']), axis=1)
    df_features['q2_unique'] = df_features.apply(lambda x: getdiffwords(x['question2'], x['question1']), axis=1)
    #df_features['question1_nouns'] = df_features.question1.map(lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(str(x).lower())) if t[:1] in ['N']])
    #df_features['question2_nouns'] = df_features.question2.map(lambda x: [w for w, t in nltk.pos_tag(nltk.word_tokenize(str(x).lower())) if t[:1] in ['N']])
    df_features['question1_w2v'] = df_features.question1.map(lambda x: get_vector_tfidf(" ".join(x)))
    df_features['question2_w2v'] = df_features.question2.map(lambda x: get_vector_tfidf(" ".join(x)))
    print('z_dist')
    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    df_features['z_dist'] = df_features.apply(lambda x: Levenshtein.ratio(x['question1'], x['question2']), axis=1)
    now = datetime.datetime.now()
    print('z_tfidf_cos_sim')
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    df_features['z_tfidf_cos_sim'] = df_features.apply(lambda x: cos_sim(x['question1'], x['question2']), axis=1)
    now = datetime.datetime.now()
    print('z_w2v_nones')
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    df_features['z_w2v_unique'] = df_features.apply(lambda x: w2v_cos_sim_tfidf(x['q1_unique'], x['q2_unique']), axis=1)
    df_features['z_w2v_dis_e'] = df_features.apply(lambda x: spatial.distance.euclidean(x['question1_w2v'], x['question2_w2v']), axis=1)
    df_features['z_w2v_dis_mink'] = df_features.apply(lambda x: spatial.distance.minkowski(x['question1_w2v'], x['question2_w2v'], 3), axis=1)
    df_features['z_w2v_dis_cityblock'] = df_features.apply(lambda x: spatial.distance.cityblock(x['question1_w2v'], x['question2_w2v']), axis=1)
    df_features['z_w2v_dis_canberra'] = df_features.apply(lambda x: spatial.distance.canberra(x['question1_w2v'], x['question2_w2v']), axis=1)
    df_features['z_q1_skew'] = df_features.question1_w2v.map(lambda x: skew(x))
    df_features['z_q2_skew'] = df_features.question2_w2v.map(lambda x: skew(x))
    df_features['z_q1_kur'] = df_features.question1_w2v.map(lambda x: kurtosis(x))
    df_features['z_q2_kur'] = df_features.question2_w2v.map(lambda x: kurtosis(x))
    del df_features['question1_w2v']
    del df_features['question2_w2v']
    print('all done')
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    df_features = df_features.fillna(0.0)  # fillna returns a copy; assign it back
    return df_features
def _get_grid_size(data, use_default_square=False):
    """
    Calculate the size of the grid.

    Parameters
    ----------
    data : array-like
        The normalized data.
    use_default_square : bool
        Define the grid as the minimal possible square.

    Returns
    -------
    int, int
        The width and height of the grid.
    """
    # if the grid were square, this would be the minimum size
    sqr_size = int(np.ceil(np.sqrt(len(data))))
    size_x = size_y = sqr_size
    if not use_default_square:
        kurt = kurtosis(data)
        kurt_x, kurt_y = np.int32(np.abs(np.ceil(kurt * 2)))
        size_x += kurt_x
        size_y += kurt_y
    return size_x, size_y
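A usage sketch with hypothetical two-column data, assuming kurtosis here is scipy.stats.kurtosis (so it returns one value per column):

import numpy as np

data = np.random.default_rng(1).normal(size=(500, 2))
print(_get_grid_size(data))                            # square stretched by per-column kurtosis
print(_get_grid_size(data, use_default_square=True))   # minimal square: (23, 23)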
def features(self, q1, q2):
    q1 = str(q1).lower().split()
    q2 = str(q2).lower().split()
    q1 = [w for w in q1 if w not in stopwords]
    q2 = [w for w in q2 if w not in stopwords]
    wmd = min(self.model.wmdistance(q1, q2), 10)
    q1vec = self.sent2vec(q1)
    q2vec = self.sent2vec(q2)
    if q1vec is not None and q2vec is not None:
        cos = cosine(q1vec, q2vec)
        city = cityblock(q1vec, q2vec)
        jacc = jaccard(q1vec, q2vec)
        canb = canberra(q1vec, q2vec)
        eucl = euclidean(q1vec, q2vec)
        mink = minkowski(q1vec, q2vec, 3)
        bray = braycurtis(q1vec, q2vec)
        q1_skew = skew(q1vec)
        q2_skew = skew(q2vec)
        q1_kurt = kurtosis(q1vec)
        q2_kurt = kurtosis(q2vec)
    else:
        cos = city = jacc = canb = eucl = mink = bray = -1
        q1_skew = q2_skew = q1_kurt = q2_kurt = 0
    return wmd, cos, city, jacc, canb, eucl, mink, bray, q1_skew, q2_skew, q1_kurt, q2_kurt
def features(self, q1, q2):
    q1 = str(q1).lower().split()
    q2 = str(q2).lower().split()
    q1 = [w for w in q1 if w not in stopwords]
    q2 = [w for w in q2 if w not in stopwords]
    wmd = min(self.model.wmdistance(q1, q2), 10)
    wmd_norm = min(self.model_norm.wmdistance(q1, q2), 10)
    q1vec = self.sent2vec(q1)
    q2vec = self.sent2vec(q2)
    if q1vec is not None and q2vec is not None:
        cos = cosine(q1vec, q2vec)
        city = cityblock(q1vec, q2vec)
        jacc = jaccard(q1vec, q2vec)
        canb = canberra(q1vec, q2vec)
        eucl = euclidean(q1vec, q2vec)
        mink = minkowski(q1vec, q2vec, 3)
        bray = braycurtis(q1vec, q2vec)
        q1_skew = skew(q1vec)
        q2_skew = skew(q2vec)
        q1_kurt = kurtosis(q1vec)
        q2_kurt = kurtosis(q2vec)
    else:
        cos = city = jacc = canb = eucl = mink = bray = -1
        q1_skew = q2_skew = q1_kurt = q2_kurt = 0
    return wmd, wmd_norm, cos, city, jacc, canb, eucl, mink, bray, q1_skew, q2_skew, q1_kurt, q2_kurt
def lightcurve_moments(ftimes, fmags, ferrs):
    '''This calculates the weighted mean, stdev, median, MAD, percentiles, skew,
    kurtosis, fraction of LC beyond 1-stdev, and IQR.
    '''
    ndet = len(fmags)
    if ndet > 9:
        # now calculate the various things we need
        series_median = npmedian(fmags)
        series_wmean = (
            npsum(fmags*(1.0/(ferrs*ferrs)))/npsum(1.0/(ferrs*ferrs))
        )
        series_mad = npmedian(npabs(fmags - series_median))
        series_stdev = 1.483*series_mad  # 1.483*MAD is the robust stdev estimate for a normal distribution
        series_skew = spskew(fmags)
        series_kurtosis = spkurtosis(fmags)
        # get the beyond1std fraction
        series_above1std = len(fmags[fmags > (series_median + series_stdev)])
        series_below1std = len(fmags[fmags < (series_median - series_stdev)])
        # this is the fraction beyond 1 stdev
        series_beyond1std = (series_above1std + series_below1std)/float(ndet)
        # get the magnitude percentiles
        series_mag_percentiles = nppercentile(
            fmags,
            [5.0, 10, 17.5, 25, 32.5, 40, 60, 67.5, 75, 82.5, 90, 95]
        )
        return {
            'median': series_median,
            'wmean': series_wmean,
            'mad': series_mad,
            'stdev': series_stdev,
            'skew': series_skew,
            'kurtosis': series_kurtosis,
            'beyond1std': series_beyond1std,
            'mag_percentiles': series_mag_percentiles,
            'mag_iqr': series_mag_percentiles[8] - series_mag_percentiles[3],  # 75th minus 25th percentile
        }
    else:
        LOGERROR('not enough detections in this magseries '
                 'to calculate light curve moments')
        return None
def create_scipy_features(base_features, sentinel):
    r"""Calculate the skew, kurtosis, and other statistical features
    for each row.

    Parameters
    ----------
    base_features : numpy array
        The feature dataframe.
    sentinel : float
        The number to be imputed for NaN values.

    Returns
    -------
    sp_features : numpy array
        The calculated SciPy features.
    """
    logger.info("Creating SciPy Features")
    # Generate scipy features
    logger.info("SciPy Feature: geometric mean")
    row_gmean = sps.gmean(base_features, axis=1)
    logger.info("SciPy Feature: kurtosis")
    row_kurtosis = sps.kurtosis(base_features, axis=1)
    logger.info("SciPy Feature: kurtosis test")
    row_ktest, pvalue = sps.kurtosistest(base_features, axis=1)
    logger.info("SciPy Feature: normal test")
    row_normal, pvalue = sps.normaltest(base_features, axis=1)
    logger.info("SciPy Feature: skew")
    row_skew = sps.skew(base_features, axis=1)
    logger.info("SciPy Feature: skew test")
    row_stest, pvalue = sps.skewtest(base_features, axis=1)
    logger.info("SciPy Feature: variation")
    row_var = sps.variation(base_features, axis=1)
    logger.info("SciPy Feature: signal-to-noise ratio")
    row_stn = sps.signaltonoise(base_features, axis=1)  # removed in SciPy 1.0; see the note below
    logger.info("SciPy Feature: standard error of mean")
    row_sem = sps.sem(base_features, axis=1)
    sp_features = np.column_stack((row_gmean, row_kurtosis, row_ktest,
                                   row_normal, row_skew, row_stest,
                                   row_var, row_stn, row_sem))
    sp_features = impute_values(sp_features, 'float64', sentinel)
    sp_features = StandardScaler().fit_transform(sp_features)
    # Return new SciPy features
    logger.info("SciPy Feature Count : %d", sp_features.shape[1])
    return sp_features
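Note: scipy.stats.signaltonoise was deprecated in SciPy 0.16 and removed in 1.0. On modern SciPy, an equivalent drop-in matching the removed function's mean/std definition is:

import numpy as np

def signaltonoise(a, axis=0, ddof=0):
    # mean divided by standard deviation, as the removed scipy.stats.signaltonoise computed
    a = np.asanyarray(a)
    m = a.mean(axis=axis)
    sd = a.std(axis=axis, ddof=ddof)
    return np.where(sd == 0, 0, m / sd)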
#
# Function create_clusters
#
def train(self, s, kurtosis, learningRate, tolerance, maxIter, callback, verbose):
    s = self.prep(s)
    wPrev = np.empty(self.w.shape)
    grad = np.empty((self.nComp, self.nComp))
    I = np.eye(self.nComp, dtype=self.dtype)
    n = 1.0/s.shape[0]
    iteration = 0
    while True:
        y = s.dot(self.w)
        if kurtosis == 'sub':
            k = -1
        elif kurtosis == 'super':
            k = 1
        elif kurtosis == 'adapt':
            #k = np.sign(np.mean(1.0-util.fastTanh(y)**2, axis=0) *
            #            np.mean(y**2, axis=0) -
            #            np.mean(y*util.fastTanh(y), axis=0))
            k = np.sign(spstat.kurtosis(y, axis=0))
            k[np.isclose(k, 0.0)] = -1.0
        grad[...] = (I - k*util.fastTanh(y).T.dot(y) - y.T.dot(y)).T.dot(self.w) * n
        wPrev[...] = self.w
        self.w += learningRate * grad
        wtol = np.max(np.abs(wPrev-self.w))
        if verbose:
            print('%d %6f' % (iteration, wtol))
        if callback is not None:
            callback(iteration, wtol)
        if wtol < tolerance:
            self.reason = 'tolerance'
            break
        elif np.max(np.abs(self.w)) > 1.0e100:
            self.reason = 'diverge'
            break
        if iteration >= maxIter:
            self.reason = 'maxiter'
            break
        iteration += 1
    if verbose:
        print('Reason: ' + self.reason)
    self.w /= np.sqrt(np.sum(self.w**2, axis=0))
    self.wInv[...] = np.linalg.pinv(self.w)
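In the 'adapt' branch above, the sign of each output's excess kurtosis selects the sub- or super-Gaussian update rule, as in extended Infomax ICA. A standalone illustration of that sign rule:

import numpy as np
from scipy import stats as spstat

rng = np.random.default_rng(0)
y = np.column_stack((rng.uniform(-1, 1, 10000),   # sub-Gaussian: negative excess kurtosis
                     rng.laplace(size=10000)))    # super-Gaussian: positive excess kurtosis
print(np.sign(spstat.kurtosis(y, axis=0)))  # expected: [-1.  1.]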
def compute_basic_descriptives(df, selected_features):
    """
    Compute basic descriptive statistics for the columns
    in the given data frame.

    Parameters
    ----------
    df : pandas DataFrame
        Input data frame containing the feature values.
    selected_features : list of str
        List of feature names for which to compute
        the descriptives.

    Returns
    -------
    df_desc : pandas DataFrame
        Data frame containing the descriptives for
        each of the features.
    """
    # select only feature columns
    df_desc = df[selected_features]
    # get the H1 scores
    scores = df['sc1']
    # compute correlations and p-values separately for efficiency
    cor_series = df_desc.apply(lambda s: pearsonr(s, scores))
    cors = cor_series.apply(lambda t: t[0])
    pvalues = cor_series.apply(lambda t: t[1])
    # create a data frame with all the descriptives
    df_output = pd.DataFrame({'mean': df_desc.mean(),
                              'min': df_desc.min(),
                              'max': df_desc.max(),
                              'std. dev.': df_desc.std(),
                              'skewness': df_desc.skew(),
                              'kurtosis': df_desc.apply(lambda s: kurtosis(s, fisher=False)),
                              'Correlation': cors,
                              'p': pvalues,
                              'N': len(df_desc)})
    # reorder the columns to make it look better
    df_output = df_output[['mean', 'std. dev.', 'min', 'max',
                           'skewness', 'kurtosis', 'Correlation',
                           'p', 'N']]
    return df_output
def get_features(df_features):
    print('use w2v to document presentation')
    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    #df_features['z_document_dis'] = df_features.apply(lambda x: getDiff_averge(x['question1'], x['question2']), axis=1)
    print('get_w2v')
    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    df_features['q1_unique'] = df_features.apply(lambda x: getdiffwords(x['question1'], x['question2']), axis=1)
    df_features['q2_unique'] = df_features.apply(lambda x: getdiffwords(x['question2'], x['question1']), axis=1)
    df_features['q1_unique_w2v_weight'] = df_features.q1_unique.map(lambda x: get_vector(" ".join(x)))
    df_features['q2_unique_w2v_weight'] = df_features.q2_unique.map(lambda x: get_vector(" ".join(x)))
    df_features['q1_unique_w2v'] = df_features.q1_unique.map(lambda x: get_weight_vector(" ".join(x)))
    df_features['q2_unique_w2v'] = df_features.q2_unique.map(lambda x: get_weight_vector(" ".join(x)))
    print('z_dist')
    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    #df_features['z_dist'] = df_features.apply(lambda x: Levenshtein.ratio(x['question1'], x['question2']), axis=1)
    now = datetime.datetime.now()
    print('z_tfidf_cos_sim')
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    #df_features['z_tfidf_cos_sim'] = df_features.apply(lambda x: cos_sim(x['question1'], x['question2']), axis=1)
    now = datetime.datetime.now()
    print('z_w2v_calc')
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    #df_features['z_w2v_unique'] = df_features.apply(lambda x: w2v_cos_sim(x['q1_unique'], x['q2_unique']), axis=1)
    df_features['z_w2v_unique_dis_e_weight'] = df_features.apply(lambda x: spatial.distance.euclidean(x['q1_unique_w2v_weight'], x['q2_unique_w2v_weight']), axis=1)
    df_features['z_w2v_unique_dis_e'] = df_features.apply(lambda x: spatial.distance.euclidean(x['q1_unique_w2v'], x['q2_unique_w2v']), axis=1)
    df_features['z_w2v_unique_dis_mink_w'] = df_features.apply(lambda x: spatial.distance.minkowski(x['q1_unique_w2v_weight'], x['q2_unique_w2v_weight'], 3), axis=1)
    df_features['z_w2v_unique_dis_cityblock_w'] = df_features.apply(lambda x: spatial.distance.cityblock(x['q1_unique_w2v_weight'], x['q2_unique_w2v_weight']), axis=1)
    df_features['z_w2v_unique_dis_canberra_w'] = df_features.apply(lambda x: spatial.distance.canberra(x['q1_unique_w2v_weight'], x['q2_unique_w2v_weight']), axis=1)
    df_features['z_w2v_unique_dis_mink'] = df_features.apply(lambda x: spatial.distance.minkowski(x['q1_unique_w2v'], x['q2_unique_w2v'], 3), axis=1)
    df_features['z_w2v_unique_dis_cityblock'] = df_features.apply(lambda x: spatial.distance.cityblock(x['q1_unique_w2v'], x['q2_unique_w2v']), axis=1)
    df_features['z_w2v_unique_dis_canberra'] = df_features.apply(lambda x: spatial.distance.canberra(x['q1_unique_w2v'], x['q2_unique_w2v']), axis=1)
    df_features['z_q1_unique_skew_w'] = df_features.q1_unique_w2v_weight.map(lambda x: skew(x))
    df_features['z_q2_unique_skew_w'] = df_features.q2_unique_w2v_weight.map(lambda x: skew(x))
    df_features['z_q1_unique_kur_w'] = df_features.q1_unique_w2v_weight.map(lambda x: kurtosis(x))
    df_features['z_q2_unique_kur_w'] = df_features.q2_unique_w2v_weight.map(lambda x: kurtosis(x))
    df_features['z_q1_unique_skew'] = df_features.q1_unique_w2v.map(lambda x: skew(x))
    df_features['z_q2_unique_skew'] = df_features.q2_unique_w2v.map(lambda x: skew(x))
    df_features['z_q1_unique_kur'] = df_features.q1_unique_w2v.map(lambda x: kurtosis(x))
    df_features['z_q2_unique_kur'] = df_features.q2_unique_w2v.map(lambda x: kurtosis(x))
    del df_features['q1_unique_w2v_weight']
    del df_features['q2_unique_w2v_weight']
    del df_features['q1_unique_w2v']
    del df_features['q2_unique_w2v']
    print('all done')
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    df_features = df_features.fillna(0.0)  # fillna returns a copy; assign it back
    return df_features
def _detect_artifacts(ica, raw, start_find, stop_find, ecg_ch, ecg_score_func,
                      ecg_criterion, eog_ch, eog_score_func, eog_criterion,
                      skew_criterion, kurt_criterion, var_criterion,
                      add_nodes):
    """Aux Function"""
    from scipy import stats

    nodes = []
    if ecg_ch is not None:
        nodes += [_ica_node('ECG', ecg_ch, ecg_score_func, ecg_criterion)]
    if eog_ch not in [None, []]:
        if not isinstance(eog_ch, list):
            eog_ch = [eog_ch]
        for idx, ch in enumerate(eog_ch):
            nodes += [_ica_node('EOG %02d' % idx, ch, eog_score_func,
                                eog_criterion)]
    if skew_criterion is not None:
        nodes += [_ica_node('skewness', None, stats.skew, skew_criterion)]
    if kurt_criterion is not None:
        nodes += [_ica_node('kurtosis', None, stats.kurtosis, kurt_criterion)]
    if var_criterion is not None:
        nodes += [_ica_node('variance', None, np.var, var_criterion)]
    if add_nodes is not None:
        nodes.extend(add_nodes)
    for node in nodes:
        scores = ica.score_sources(raw, start=start_find, stop=stop_find,
                                   target=node.target,
                                   score_func=node.score_func)
        if isinstance(node.criterion, float):
            found = list(np.where(np.abs(scores) > node.criterion)[0])
        else:
            found = list(np.atleast_1d(abs(scores).argsort()[node.criterion]))
        case = (len(found), 's' if len(found) > 1 else '', node.name)
        logger.info('    found %s artifact%s by %s' % case)
        ica.exclude += found
    logger.info('Artifact indices found:\n    ' + str(ica.exclude).strip('[]'))
    if len(set(ica.exclude)) != len(ica.exclude):
        logger.info('    Removing duplicate indices...')
        ica.exclude = list(set(ica.exclude))
    logger.info('Ready.')
Source file: default_predictor.py
Project: Default-Credit-Card-Prediction
Author: AlexPnt
def get_feature_stats(self):
    # get the input feature
    feature_input = self.feature_input.currentText()
    try:
        if feature_input[0] == 'X':
            try:
                feature_index = int("".join(feature_input[1:]))
                feature_index -= 1
            except:
                QtWidgets.QMessageBox.information(self, "Wrong Format", "Please enter a feature name in the format: X%d.")
                return
        elif "".join(feature_input[0]+feature_input[1]) == 'LD' or "".join(feature_input[0]+feature_input[1]) == 'PC':
            try:
                feature_index = int("".join(feature_input[2:]))
                feature_index -= 1
            except:
                QtWidgets.QMessageBox.information(self, "Wrong Format", "Please enter a feature name in the format: X||LD||PC%d.")
                return
        else:
            QtWidgets.QMessageBox.information(self, "Wrong Format", "Feature names must be in the format: X%d.")
            return
    except:
        QtWidgets.QMessageBox.information(self, "Data Not Found", "Please load a dataset first.")
        return
    try:
        max_value = self.X[:, feature_index].max()
        min_value = self.X[:, feature_index].min()
        mean_value = self.X[:, feature_index].mean()
        std_value = self.X[:, feature_index].std()
        var_value = self.X[:, feature_index].var()
        skewness = stats.skew(self.X[:, feature_index])
        kurtosis = stats.kurtosis(self.X[:, feature_index], fisher=True)
        chi2, chi_p_val = chi2_feature_test(self.X, self.y, int(feature_index))
        H_kw, kw_p_val = kw_feature_test(self.X, self.y, int(feature_index))
        info_gain = information_gain(self.X, self.y, int(feature_index))
        gain_rt = gain_ratio(self.X, self.y, int(feature_index))
    except:
        QtWidgets.QMessageBox.information(self, "Wrong Index", "Feature Index Out Of Bounds.")
        return
    feature_stats = """Statistics:\n\nMinimum Value: """ + str(min_value)\
        + """\n\nMaximum Value: """ + str(max_value)\
        + """\n\nMean: """ + str(mean_value)\
        + """\n\nStandard Deviation: """ + str(std_value)\
        + """\n\nVariance: """ + str(var_value)\
        + """\n\nSkewness: """ + str(skewness)\
        + """\n\nKurtosis: """ + str(kurtosis)\
        + """\n\nChi Squared Test: """ + str(chi2[0])\
        + """\n\nKruskal-Wallis Test: """ + str(H_kw)\
        + """\n\nInformation Gain: """ + str(info_gain)\
        + """\n\nGain Ratio: """ + str(gain_rt)
    self.feature_stats.setText(feature_stats)