functions.py 文件源码-python代码片段

def calculatepRCA(data, y ='',c='',p='',x=''):
    '''
    Returns the pRCA from data. pRCA is the probability that (RCA_{y+1} > 1) given the volume of exports (x_{cpy}),
    and the 'baseline term' (\sum_c x_{cpy}  \sum_p x_{cpy} / \sum_c \sum_p x_{cpy}).
    It is computed using k-nearest neighbors, in the space of log exports and log baseline term.
    Parameters
    ----------
    data : pandas.DataFrame
        Raw data. It has source,target,volume (trade, number of people etc.).
    y,c,p,x : str (optional)
        Labels of the columns in data used for source,target,volume
    Returns
    -------
    RCA : pandas.DataFrame
        Table with the RCAs, with the columns c,p,x,RCA
        If shares is True it also includes:
            s_c : Share of X_cp over X_c
            s_p : Share of X_cp over X_p
    '''
    df = calculateRCA_by_year(data,y ='year',c='ccode',p='pcode',x='x',log_terms = True)

    #Compute (RCA > 1) next year and merge it
    df_ = df.copy()
    df_['year'] = df_['year'] - 1
    df_['RCA_y+1'] = (df_['log(RCA)'] > 0).astype(int)
    df_ = df_[['year','ccode','pcode','RCA_y+1']]
    df = df.merge(df_)

    #Prepare dataset for knn and fit
    M = df[['log(x)','T','RCA_y+1']].as_matrix()
    X, y = M[:,:2], M[:, 2] 
    knn = neighbors.KNeighborsRegressor(n_neighbors = 200, weights = 'uniform').fit(X, y)

    #To avoid memory error, compute predictions in split X. Predictions are output pRCA
    pRCA = np.array([])
    for x in np.array_split(X, 10):
        pRCA = np.append(pRCA, knn.predict(x))
    df['pRCA'] = pRCA

    return df