Python merge() usage examples
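
Every snippet below is a real-world use of pandas' pd.merge. As a quick reference before the examples, here is a minimal sketch of the call itself (the frames and column names are made up for illustration):

import pandas as pd

left = pd.DataFrame({'user_id': [1, 2, 3], 'name': ['a', 'b', 'c']})
right = pd.DataFrame({'uid': [1, 2, 4], 'score': [0.9, 0.5, 0.1]})

# left_on/right_on handle differently named key columns;
# how= can be 'inner' (default), 'left', 'right' or 'outer'
joined = pd.merge(left, right, left_on='user_id', right_on='uid', how='left')
print(joined)  # every row of `left` is kept; unmatched rows get NaN in 'score'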

data.py (project: DREAM, author: LaceyChen17)
def __init__(self, up_basket, up_r_basket = None, up_his = None):
        if (up_r_basket is not None) and (up_his is not None):
            self.is_reordered_included = True
        else:
            self.is_reordered_included = False

        up_basket['num_baskets'] = up_basket.basket.apply(len)
        self.user_id = list(up_basket.user_id)
        self.num_baskets = [int(n) for n in list(up_basket.num_baskets)]    
        self.basket = [[[int(p) for p in b] for b in u] for u in list(up_basket.basket)]

        if self.is_reordered_included is True:
            up_basket = pd.merge(up_basket, up_r_basket, on = ['user_id'], how = 'left')
            up_basket = pd.merge(up_basket, up_his, on = ['user_id'], how = 'left')
            self.reorder_basket = [[[int(p) for p in b] for b in u] for u in list(up_basket.reorder_basket)]
            self.history_item = [[[int(p) for p in b] for b in u] for u in list(up_basket.history_items)]

actor.py (project: saapy, author: ashapochka)
def insert_actor_ids(commit_frame, actor_frame, drop_name_email=True):
    actor_columns = ['author_name', 'author_email',
                     'committer_name', 'committer_email']
    cf = commit_frame[actor_columns]
    af = actor_frame[['name', 'email', 'actor_id']]
    author = pd.merge(
        cf, af, left_on=actor_columns[:2],
        right_on=('name', 'email'),
        how='left')['actor_id']
    committer = pd.merge(
        cf, af, left_on=actor_columns[2:],
        right_on=('name', 'email'),
        how='left')['actor_id']
    commit_frame.insert(3, 'author', author)
    commit_frame.insert(4, 'committer', committer)
    if drop_name_email:
        commit_frame.drop(actor_columns, axis=1, inplace=True)
    return commit_frame
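
insert_actor_ids resolves the author and committer of each commit by left-merging on a composite (name, email) key. A minimal sketch of that composite-key merge, with made-up frames and only the author side shown:

import pandas as pd

commits = pd.DataFrame({'author_name': ['ann', 'bob'],
                        'author_email': ['ann@x.io', 'bob@x.io']})
actors = pd.DataFrame({'name': ['ann', 'bob'],
                       'email': ['ann@x.io', 'bob@x.io'],
                       'actor_id': [10, 11]})

# a left join preserves the row order of `commits`, so the resulting
# actor_id column can be inserted straight back as a new column
author_id = pd.merge(commits, actors,
                     left_on=['author_name', 'author_email'],
                     right_on=['name', 'email'],
                     how='left')['actor_id']
commits.insert(0, 'author', author_id)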

evalx.py (project: kaggle-review, author: daxiongshu)
def eval(name,clip=False,bar=0.9):
    base = pd.read_csv('../input/stage1_solution_filtered.csv')
    base['Class'] = np.argmax(base[['class%d'%i for i in range(1,10)]].values,axis=1)
    sub = pd.read_csv(name)
    #sub = pd.merge(sub,base[['ID','Class']],on="ID",how='right')
    #print(sub.head())
    y = base['Class'].values
    yp = sub[['class%d'%i for i in range(1,10)]].values
    if clip:
        yp = np.clip(yp,(1.0-bar)/8,bar)
        yp = yp/np.sum(yp,axis=1).reshape([yp.shape[0],1])
    print(name,cross_entropy(y,yp),multiclass_log_loss(y,yp))
    for i in range(9):
        y1 = y[y==i]
        yp1 = yp[y==i]
        print(i,y1.shape,cross_entropy(y1,yp1),multiclass_log_loss(y1,yp1))

replace.py (project: kaggle-review, author: daxiongshu)
def replace(s,n):
    seen = pd.read_csv(s)
    unseen = pd.read_csv(n)
    te = pd.read_csv('../input/stage2_test_variants.csv')
    tr = pd.read_csv('../input/training_variants')
    unseen = pd.merge(unseen,te,on='ID',how='right')
    seen = pd.merge(seen,te,on='ID',how='right')
    mask = seen.Gene.isin(tr.Gene)
    cols = ['class%d'%i for i in range(1,10)]
    seen.loc[~mask,cols] = 0

    mask = unseen.Gene.isin(tr.Gene)
    unseen.loc[mask,cols] = 0

    assert (unseen['ID']==seen['ID']).all()
    seen[cols] = seen[cols] + unseen[cols]

    seen[cols+['ID']].to_csv('mix.csv',index=False)

feature_extract.py (project: powerAI, author: dreameng28)
def user_info(df):
    grouped = df[[user_id, power_consumption]].groupby([user_id], as_index=False)
    user_power_mean = grouped.mean()
    user_power_median = grouped.median()
    user_power_var = grouped.var()
    user_power_max = grouped.max()
    user_power_min = grouped.min()
    user_power_mean = user_power_mean.rename(columns={power_consumption: 'user_power_mean'})
    user_power_median = user_power_median.rename(columns={power_consumption: 'user_power_median'})
    user_power_var = user_power_var.rename(columns={power_consumption: 'user_power_var'})
    user_power_max = user_power_max.rename(columns={power_consumption: 'user_power_max'})
    user_power_min = user_power_min.rename(columns={power_consumption: 'user_power_min'})
    return pd.merge(user_power_mean, user_power_median).merge(user_power_var).\
        merge(user_power_max).merge(user_power_min)
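
None of the merges above pass on=: when the key is omitted, pd.merge joins on every column name the two frames have in common, which here is only the user_id column produced by the groupby. A tiny sketch of that default behaviour (made-up data):

import pandas as pd

a = pd.DataFrame({'user_id': [1, 2], 'user_power_mean': [3.0, 4.0]})
b = pd.DataFrame({'user_id': [1, 2], 'user_power_max': [5.0, 9.0]})

# with no on=/left_on=/right_on=, the intersection of column names is used as the key
merged = pd.merge(a, b)
print(list(merged.columns))  # ['user_id', 'user_power_mean', 'user_power_max']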


feature_extract.py (project: powerAI, author: dreameng28)
def user_info_m_p(df):
    date2 = df[record_date].map(lambda x: str2time(x)).max()
    date1 = datetime.datetime(date2.year, date2.month, 1).date()
    grouped = DataView(df).filter_by_record_date2(date1, date2)[[user_id, 'month', power_consumption]].groupby([user_id, 'month'], as_index=False)
    user_power_mean_m = grouped.mean()
    user_power_median_m = grouped.median()
    user_power_var_m = grouped.var()
    user_power_max_m = grouped.max()
    user_power_min_m = grouped.min()
    user_power_mean_m = user_power_mean_m.rename(columns={power_consumption: 'user_power_mean_m_p'})
    user_power_median_m = user_power_median_m.rename(columns={power_consumption: 'user_power_median_m_p'})
    user_power_var_m = user_power_var_m.rename(columns={power_consumption: 'user_power_var_m_p'})
    user_power_max_m = user_power_max_m.rename(columns={power_consumption: 'user_power_max_m_p'})
    user_power_min_m = user_power_min_m.rename(columns={power_consumption: 'user_power_min_m_p'})
    return pd.merge(user_power_mean_m, user_power_median_m).merge(user_power_var_m).\
        merge(user_power_max_m).merge(user_power_min_m).drop('month', axis=1)


DataIO.py (project: kaggle, author: RankingAI)
def LoadFromTextFile(InputDir):

        ## raw data
        TrainData = pd.read_csv('%s/train_2016_v2.csv' % InputDir, parse_dates=['transactiondate'], header=0)
        TestData = pd.read_csv('%s/sample_submission.csv' % InputDir, header=0)
        TestData['parcelid'] = TestData['ParcelId']
        TestData.drop('ParcelId', axis=1, inplace=True)
        PropertyData = pd.read_csv('%s/properties_2016.csv' % InputDir,header=0)
        for c, dtype in zip(PropertyData.columns, PropertyData.dtypes):
            if dtype == np.float64:
                PropertyData[c] = PropertyData[c].astype(np.float32)

        ## join dynamic data with static data
        TrainData = pd.merge(TrainData, PropertyData, how='left', on='parcelid')
        TestData = pd.merge(TestData, PropertyData, how='left', on='parcelid')

        return TrainData,TestData

    ## class method, save data with pkl format

utils.py (project: SWEETer-Cat, author: DanielAndreasen)
def planetAndStar(how='inner'):
    """Read the SWEET-Cat and ExoplanetEU databases and merge them.

    Input
    -----
    how : str (default: 'inner')
      How to merge the two DataFrames. See pd.merge for documentation

    Output
    ------
    d : pd.DataFrame
      The merged DataFrame
    c : list
      The columns that can be used for plotting
    """
    df, columns = readSC()
    deu = readExoplanetEU()
    cols = ['stName', 'plMass', 'plRadius', 'period', 'sma', 'eccentricity',
            'inclination', 'discovered', 'dist', 'b',
            'mag_v', 'mag_i', 'mag_j', 'mag_h', 'mag_k', 'plDensity']
    d = pd.merge(df, deu, left_on='Star', right_on='stName', how=how)
    d['radius'] = list(map(stellar_radius, d['mass'], d['logg']))
    d['teq0'] = d.teff * np.sqrt((d.radius*700000)/(2*d.sma*150000000))
    c = columns + cols[1:]
    return d, c

df_merger.py (project: Eskapade, author: KaveIO)
def initialize(self):
        """ Perform basic checks on provided attributes.
        """

        if isinstance(self.on, str):
            self.on = [self.on]
        elif not isinstance(self.on,list):
            raise Exception('on is not a list of strings. Exit.')

        assert len(self.on) > 0, 'not specified on which keys to merge.'
        assert self.how in ('inner', 'outer', 'left', 'right'), \
            'how to merge not specified correctly.'
        assert len(self.output_collection), 'output_collection not specified.'

        # add back on to kwargs, so it's picked up by pandas.
        if self.on is not None:
            self.kwargs['on'] = self.on
        if self.how is not None:
            self.kwargs['how'] = self.how

        self.log().info('kwargs passed on to pandas merge function are: %s' % self.kwargs )

        return StatusCode.Success

guide_legend.py (project: plotnine, author: has2k1)
def merge(self, other):
        """
        Merge overlapped guides

        For example::

            from ggplot import *
            gg = ggplot(aes(x='cut', fill='cut', color='cut'), data=diamonds)
            gg + stat_bin()

        This would create similar guides for fill and color where only
        a single guide would do
        """
        self.key = pd.merge(self.key, other.key)
        duplicated = set(self.override_aes) & set(other.override_aes)
        if duplicated:
            warn("Duplicated override_aes is ignored.")
        self.override_aes.update(other.override_aes)
        for ae in duplicated:
            self.override_aes.pop(ae)
        return self

utils.py (project: plotnine, author: has2k1)
def cross_join(df1, df2):
    """
    Return a dataframe that is a cross between dataframes
    df1 and df2

    ref: https://github.com/pydata/pandas/issues/5401
    """
    if len(df1) == 0:
        return df2

    if len(df2) == 0:
        return df1

    # Add as lists so that the new index keeps the items in
    # the order that they are added together
    all_columns = pd.Index(list(df1.columns) + list(df2.columns))
    df1['key'] = 1
    df2['key'] = 1
    return pd.merge(df1, df2, on='key').loc[:, all_columns]
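
A minimal usage sketch of the helper above (assuming cross_join is in scope; the data is made up): every row of the first frame is paired with every row of the second by joining on a constant key.

import pandas as pd

colors = pd.DataFrame({'color': ['red', 'blue']})
sizes = pd.DataFrame({'size': ['S', 'M', 'L']})

combos = cross_join(colors, sizes)
print(len(combos))           # 2 * 3 = 6 rows
print(list(combos.columns))  # ['color', 'size'] -- the helper drops the temporary key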

enrich.py (project: ceres, author: dicortazar)
def enrich(self, column, projects):
        """ This method adds a new column named as 'project'
        that contains information about the associated project
        that the event in 'column' belongs to.

        :param column: column with information related to the project
        :type column: string
        :param projects: mapping between items and their projects
        :type projects: pandas.DataFrame

        :returns: original data frame with a new column named 'project'
        :rtype: pandas.DataFrame
        """

        if column not in self.data.columns:
            return self.data

        self.data = pandas.merge(self.data, projects, how='left', on=column)

        return self.data

enrich.py (project: ceres, author: dicortazar)
def enrich(self, columns):
        """ Merges the original dataframe with corresponding entity uuids based
        on the given columns. Also merges other additional information
        associated to uuids provided in the uuids dataframe, if any.

        :param columns: columns to match for merging
        :type columns: string array

        :return: original dataframe with at least one new column:
         * uuid: identity unique identifier
        :rtype: pandas.DataFrame
        """

        for column in columns:
            if column not in self.data.columns:
                return self.data

        self.data = pandas.merge(self.data, self.uuids_df, how='left', on=columns)
        self.data = self.data.fillna("notavailable")

        return self.data

base.py (project: slaveo, author: lamter)
def touch_open_indexes(self):
        """
        Evaluate the open-position signal for each code.
        :return:
        """
        quo = self.quotation

        assert isinstance(quo, pd.DataFrame)

        if self.open_indexes == self.INDEXES_TYPE_MA:
            # join each code's open moving-average level with its current bid quote
            ma = pd.merge(self.codes[["open_ma", "close"]], quo[["bid1"]], left_index=True, right_index=True)
            # signal to open a position when bid1 is above the open moving average
            ma["open_position"] = ma.bid1 > ma.open_ma
            self.codes.open_position = ma.open_position
            if self.debug:
                open_num = ma.open_position.value_counts()[True]
                self.log.debug("%s codes triggered the open-position signal" % open_num)

base.py (project: slaveo, author: lamter)
def get_sell_order(self):
        """
        Build the list of holdings to sell.
        :return: DataFrame of sell candidates merged with the latest quotation
        """
        codes = self.codes[self.codes.times != self.codes.exc_times]

        codes["change"] = codes.exc_times - codes.times

        sell_codes = codes[codes.change < 0]

        # attach the latest quotation to the sell candidates
        sell_codes = pd.merge(sell_codes, self.quotation, left_index=True, right_index=True)

        # sell priority: codes whose exc_times fell below -1, ordered by times descending
        sell_priority_index = sell_codes[sell_codes.exc_times < -1].times.argsort()[::-1]

        # take rows in priority order (the reordered frame is not assigned back)
        sell_codes.take(sell_priority_index)

        return sell_codes

redirects_candidates.py (project: wikilinks, author: trovdimi)
def export_data_unresolved():

    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    db_work_view = db.get_work_view()
    connection = db_work_view._db_connection


    df_clickstream = pn.read_csv('/home/ddimitrov/data/enwiki201608_unresolved_redirects/2016_08_clickstream_unresolved.tsv', sep='\t', error_bad_lines=False)

    df_clickstream['prev']=df_clickstream['prev'].str.replace('_', ' ')
    df_clickstream['curr']=df_clickstream['curr'].str.replace('_', ' ')
    df_clickstream['curr_unresolved']=df_clickstream['curr_unresolved'].str.replace('_', ' ')


    df_redirects_candidates = pn.read_sql('select * from redirects_candidates_sample', connection)


    sample_unresoleved = pn.merge(df_redirects_candidates, df_clickstream, how='left', left_on= ['source_article_name','target_article_name'], right_on=['prev', 'curr_unresolved'])

    sample_unresoleved['n'].fillna(0, inplace=True)
    sample_unresoleved.to_csv('/home/ddimitrov/data/enwiki201608_unresolved_redirects/data_unresolved.tsv', sep='\t',encoding="utf-8")

weighted_pagerank.py (project: wikilinks, author: trovdimi)
def pickle_correlations_zeros_january():
    db = MySQLDatabase(DATABASE_HOST, DATABASE_USER, DATABASE_PASSWORD, DATABASE_NAME)
    conn = db._create_connection()

    print 'read'
    df = pd.read_sql('select source_article_id, target_article_id from link_features', conn)
    print 'loaded links'
    df2 = pd.read_sql('select prev_id, curr_id, counts from clickstream_derived_en_201501  where link_type_derived= "internal-link";',  conn)
    print 'loaded counts'
    result = pd.merge(df, df2, how='left', left_on = ['source_article_id', 'target_article_id'], right_on = ['prev_id', 'curr_id'])
    print 'merged counts'
    print result
    article_counts = result.groupby(by=["target_article_id"])['counts'].sum().reset_index()
    article_counts['counts'].fillna(0.0, inplace=True)
    print article_counts
    print 'write to file'
    article_counts[["target_article_id","counts"]].to_csv(TMP+'january_article_counts.tsv', sep='\t', index=False)

BioCompass.py (project: BioCompass, author: NP-Omix)
def antiSMASH_to_dataFrame(content):
    """ Extract an antiSMASH file as a pandas.DataFrame
    """
    parsed = parse_antiSMASH(content)
    output = pd.DataFrame()
    for cs in parsed['SignificantHits']:
        clusterSubject = parsed['SignificantHits'][cs].copy()
        df = pd.merge(
                pd.DataFrame(clusterSubject['BlastHit']),
                pd.DataFrame(clusterSubject['TableGenes']),
                on='subject_gene', how='outer')

        del(clusterSubject['BlastHit'])
        del(clusterSubject['TableGenes'])

        for v in clusterSubject:
            df[v] = clusterSubject[v]
        output = output.append(df, ignore_index=True)

    return output

fe.py (project: Kesci-ctrip-room-prediction, author: qiaoguan)
def get_basicroom_feature(data):

    t1=data[['orderid','basicroomid']].drop_duplicates()[['basicroomid']]
    t1['basicroomid_unique_order_cnt']=1
    t1=t1.groupby(['basicroomid']).agg('sum').reset_index()

    t2=data[['orderdate','basicroomid']].drop_duplicates()[['basicroomid']]
    t2['basicroomid_unique_orderdate_cnt']=1
    t2=t2.groupby(['basicroomid']).agg('sum').reset_index()

    t3=data[['uid','basicroomid']].drop_duplicates()[['basicroomid']]
    t3['basicroomid_unique_user_cnt']=1
    t3=t3.groupby(['basicroomid']).agg('sum').reset_index()

    t4=data[['basicroomid','roomid']].drop_duplicates()[['basicroomid']]
    t4['basicroomid_unique_roomid_cnt']=1
    t4=t4.groupby(['basicroomid']).agg('sum').reset_index()

#    basicroom_feature=pd.merge(t,t1,on='basicroomid')
    basicroom_feature=pd.merge(t1,t2,on='basicroomid')
    basicroom_feature=pd.merge(basicroom_feature,t3,on='basicroomid')
    basicroom_feature=pd.merge(basicroom_feature,t4,on='basicroomid')
    return basicroom_feature

preprocessing.py (project: visualizations, author: ContentMine)
def preprocess(rawdatapath):
    rawfacts = get_raw(os.path.join(rawdatapath, "facts.json"))
    rawmetadata = get_raw(os.path.join(rawdatapath, "metadata.json"))
    parsed_facts = rawfacts.join(pd.DataFrame(rawfacts["_source"].to_dict()).T).drop("_source", axis=1)
    parsed_metadata = rawmetadata.join(pd.DataFrame(rawmetadata["_source"].to_dict()).T).drop("_source", axis=1)
    parsed_metadata.rename(columns={"title":"articleTitle"}, inplace=True)
    clean(parsed_facts)
    clean(parsed_metadata)
    parsed_metadata = parsed_metadata.join(pd.DataFrame(parsed_metadata["journalInfo"].to_dict()).T).drop("journalInfo", axis=1)
    clean(parsed_metadata)
    parsed_metadata = parsed_metadata.join(pd.DataFrame(parsed_metadata["journal"].to_dict()).T).drop("journal", axis=1)
    clean(parsed_metadata)
    df = pd.merge(parsed_facts, parsed_metadata, how="inner", on="cprojectID", suffixes=('_fact', '_meta'))
    df.rename(columns={"title":"journalTitle"}, inplace=True)
    df["sourcedict"] = get_dictionary(df)
    df["term"] = df["term"].map(str.lower)
    df["wikidataID"] = get_wikidataIDs(df)
    df.drop_duplicates("_id_fact", inplace=True)
    return df

classifying.py (project: TuShare, author: andyzsf)
def get_hs300s():
    """
    Get the current constituent stocks of the HS300 (CSI 300) index and their weights.
    Return
    --------
    DataFrame
        code : stock code
        name : stock name
        date : date
        weight : weight
    """
    from tushare.stock.fundamental import get_stock_basics
    try:
        wt = pd.read_excel(ct.HS300_CLASSIFY_URL_FTP%(ct.P_TYPE['ftp'], ct.DOMAINS['idxip'], 
                                                  ct.PAGES['hs300w']), parse_cols=[0, 3, 6])
        wt.columns = ct.FOR_CLASSIFY_W_COLS
        wt['code'] = wt['code'].map(lambda x :str(x).zfill(6))
        df = get_stock_basics()[['name']]
        df = df.reset_index()
        return pd.merge(df,wt)
    except Exception as er:
        print(str(er))

classifying.py (project: TuShare, author: andyzsf)
def get_zz500s():
    """
    Get the constituent stocks of the ZZ500 (CSI 500) index.
    Return
    --------
    DataFrame
        code : stock code
        name : stock name
    """
    from tushare.stock.fundamental import get_stock_basics
    try:
#         df = pd.read_excel(ct.HS300_CLASSIFY_URL_FTP%(ct.P_TYPE['ftp'], ct.DOMAINS['idxip'], 
#                                                   ct.PAGES['zz500b']), parse_cols=[0,1])
#         df.columns = ct.FOR_CLASSIFY_B_COLS
#         df['code'] = df['code'].map(lambda x :str(x).zfill(6))
        wt = pd.read_excel(ct.HS300_CLASSIFY_URL_FTP%(ct.P_TYPE['ftp'], ct.DOMAINS['idxip'], 
                                                   ct.PAGES['zz500wt']), parse_cols=[0, 3, 6])
        wt.columns = ct.FOR_CLASSIFY_W_COLS
        wt['code'] = wt['code'].map(lambda x :str(x).zfill(6))
        df = get_stock_basics()[['name']]
        df = df.reset_index()
        return pd.merge(df,wt)
    except Exception as er:
        print(str(er))

test_overlap_filter.py (project: py_stringsimjoin, author: anhaidgroup)
def setUp(self):
        self.dlm = DelimiterTokenizer(delim_set=[' '], return_set=True)
        self.A = pd.DataFrame([{'l_id': 1, 'l_attr':'ab cd ef aa bb'},
                               {'l_id': 2, 'l_attr':''},
                               {'l_id': 3, 'l_attr':'ab'},
                               {'l_id': 4, 'l_attr':'ll oo pp'},
                               {'l_id': 5, 'l_attr':'xy xx zz fg'},
                               {'l_id': 6, 'l_attr':pd.np.NaN}])
        self.B = pd.DataFrame([{'r_id': 1, 'r_attr':'mn'},
                               {'r_id': 2, 'r_attr':'he ll'},
                               {'r_id': 3, 'r_attr':'xy pl ou'},
                               {'r_id': 4, 'r_attr':'aa'},
                               {'r_id': 5, 'r_attr':'fg cd aa ef'},
                               {'r_id': 6, 'r_attr':None}])

        # generate cartesian product A x B to be used as candset
        self.A['tmp_join_key'] = 1
        self.B['tmp_join_key'] = 1
        self.C = pd.merge(self.A[['l_id', 'tmp_join_key']],
                          self.B[['r_id', 'tmp_join_key']],
                     on='tmp_join_key').drop('tmp_join_key', 1)

        self.empty_A = pd.DataFrame(columns=['l_id', 'l_attr'])
        self.empty_B = pd.DataFrame(columns=['r_id', 'r_attr'])
        self.empty_candset = pd.DataFrame(columns=['l_id', 'r_id'])

feature_table_builder.py (project: johnson-county-ddj-public, author: dssg)
def label_feature_producer(start_date, end_date, features, labels):
    labeller = Labeller(start_date, end_date, labels)
    dataset = labeller.get_labels()
    dataset['training_end_date'] = start_date
    dataset['labeling_end_date'] = end_date
    # dataset
    feature_grabber = feature_processor.FeatureGrabber(start_date, engine,config_db, con)
    feature_name_dictionary = {}

    #print dataset
    for feature in features:
            #print feature
        res_training, feature_names_training = feature_grabber.getFeature(feature)
        feature_name_dictionary[feature] = feature_names_training
        res_training.drop_duplicates(inplace = True)
        dataset = pd.merge(dataset, res_training, on = config_db['id_column'],
                           how = 'left')

    return dataset, feature_name_dictionary

    #return None, None

chapter_02.py (project: Python_Study, author: thsheep)
def foo1():
    # pandas infers the join keys from the overlapping column names and merges the three tables
    data = pd.merge(pd.merge(ratings, users), movies)

    # average rating of each movie, broken down by gender
    # pivot_table returns a DataFrame indexed by movie title with one column per gender
    mean_rating = data.pivot_table('rating', columns='gender', index='title', aggfunc='mean')

    # find the titles that received at least 250 ratings
    ratings_by_title = data.groupby('title').size()
    active_titles = ratings_by_title.index[ratings_by_title >= 250]
    # restrict mean_rating to those actively rated titles
    mean_rating = mean_rating.ix[active_titles]
    # movies ranked by average rating among female viewers, descending
    top_female_ratings = mean_rating.sort_index(by='F', ascending=False)
    print(top_female_ratings)

computeengine.py (project: loman, author: janusassetallocation)
def to_df(self):
        """
        Get a dataframe containing the states and value of all nodes of computation

        ::

            >>> comp = loman.Computation()
            >>> comp.add_node('foo', value=1)
            >>> comp.add_node('bar', value=2)
            >>> comp.to_df()
                           state  value  is_expansion
            bar  States.UPTODATE      2           NaN
            foo  States.UPTODATE      1           NaN
        """
        df = pd.DataFrame(index=nx.topological_sort(self.dag))
        df[_AN_STATE] = pd.Series(nx.get_node_attributes(self.dag, _AN_STATE))
        df[_AN_VALUE] = pd.Series(nx.get_node_attributes(self.dag, _AN_VALUE))
        df_timing = pd.DataFrame.from_dict(nx.get_node_attributes(self.dag, 'timing'), orient='index')
        df = pd.merge(df, df_timing, left_index=True, right_index=True, how='left')
        return df

ui.py (project: dnflow, author: DocNow)
def hashtags_multi(search_id):
    ids = [search_id]
    ids.extend(request.args.getlist('id'))
    in_clause = ','.join([str(i) for i in ids])
    searches = query("""
        SELECT id, date_path, text
        FROM searches WHERE id in (%s)
        """ % in_clause)
    summary = []
    search = searches[0]
    summary.append({'id': search['id'], 'date_path': search['date_path'],
                    'text': search['text'],
                    'colname': 'count_%s' % search['id']})
    d = pd.read_csv('data/%s/count-hashtags.csv' % search['date_path'])
    d = d.rename(columns={'count': 'count_%s' % search['id']})
    for search in searches[1:]:
        summary.append({'id': search['id'], 'date_path': search['date_path'],
                        'text': search['text'],
                        'colname': 'count_%s' % search['id']})
        e = pd.read_csv('data/%s/count-hashtags.csv' % search['date_path'])
        e = e.rename(columns={'count': 'count_%s' % search['id']})
        d = pd.merge(d, e, on='hashtag', how='outer').fillna(0)
    d.sort_values(by='count_%s' % search_id, inplace=True, ascending=False)
    result = {'summary': summary, 'hashtags': d.to_dict(orient='record')}
    return jsonify(result)
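
hashtags_multi accumulates one count_<id> column per search by repeatedly outer-merging on the hashtag column and filling the gaps with 0. The same pattern in isolation, with made-up data:

import pandas as pd

d = pd.DataFrame({'hashtag': ['#a', '#b'], 'count_1': [5, 2]})
e = pd.DataFrame({'hashtag': ['#b', '#c'], 'count_2': [7, 1]})

# an outer join keeps hashtags seen in either frame; missing counts become 0
d = pd.merge(d, e, on='hashtag', how='outer').fillna(0)
print(d)
#   hashtag  count_1  count_2
# 0      #a      5.0      0.0
# 1      #b      2.0      7.0
# 2      #c      0.0      1.0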

rt_censor_diagnosis.py (project: pyPheWAS, author: BennettLandman)
def censor_diagnosis(genotype_file,phenotype_file,final_pfile, field ='na',start_time=float('nan'),end_time=float('nan')):
        import pandas as pd
        import numpy as np
        genotypes = pd.read_csv(genotype_file)
        phenotypes = pd.read_csv(phenotype_file)
        mg=pd.merge(phenotypes,genotypes,on='id')
        if np.isnan(start_time) and np.isnan(end_time):
                print("Choose appropriate time period")
        if field=='na':
                if np.isfinite(start_time) and np.isnan(end_time):
                        final = mg[mg['AgeAtICD']>start_time]
                elif np.isnan(start_time) and np.isfinite(end_time):
                        final = mg[mg['AgeAtICD']<end_time]
                else:
                        final = mg[(mg['AgeAtICD']>start_time)&(mg['AgeAtICD']<end_time)]

        else:
                mg['diff']=mg[field]-mg['AgeAtICD']
                if np.isfinite(start_time) and np.isnan(end_time):
                        final = mg[(mg['diff']>start_time)|(np.isnan(mg['diff']))]
                elif np.isnan(start_time) and np.isfinite(end_time):
                        final = mg[(mg['diff']<end_time)|(np.isnan(mg['diff']))]
                else:
                        final = mg[(mg['diff']>start_time)&(mg['diff']<end_time)|(np.isnan(mg['diff']))]
        final[['id','icd9','AgeAtICD']].to_csv(final_pfile)

movielens_preprocess.py (project: striatum, author: ntucllab)
def main():
    # read and preprocess the movie data
    movie = pd.read_table('movies.dat', sep='::', names=['movie_id', 'movie_name', 'tag'], engine='python')
    movie = movie_preprocessing(movie)

    # read the ratings data and merge it with movie data
    rating = pd.read_table("ratings.dat", sep="::",
                           names=["user_id", "movie_id", "rating", "timestamp"], engine='python')
    data = pd.merge(rating, movie, on="movie_id")

    # extract feature from our data set
    streaming_batch, user_feature, actions, reward_list = feature_extraction(data)
    streaming_batch.to_csv("streaming_batch.csv", sep='\t', index=False)
    user_feature.to_csv("user_feature.csv", sep='\t')
    pd.DataFrame(actions, columns=['movie_id']).to_csv("actions.csv", sep='\t', index=False)
    reward_list.to_csv("reward_list.csv", sep='\t', index=False)

    action_context = movie[movie['movie_id'].isin(actions)]
    action_context.to_csv("action_context.csv", sep='\t', index = False)

