Python concat() example source code
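
The examples below are excerpted from their projects, so imports are omitted. They generally assume the following setup (the exact import list varies per project):

import numpy as np
import pandas as pd
from scipy.stats import skew  # used by the data_preprocess examples

As a reminder of the API all these snippets revolve around: pd.concat stacks objects row-wise by default (axis=0) or side by side with axis=1.

a = pd.DataFrame({'x': [1, 2]})
b = pd.DataFrame({'x': [3, 4]})
rows = pd.concat([a, b], ignore_index=True)  # 4 rows, one 'x' column
cols = pd.concat([a, b], axis=1)             # 2 rows, two 'x' columns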

gradient_boosting.py (project: HousePricePredictionKaggle, author: Nuwantha)
def data_preprocess(train,test):
    outlier_idx = [4, 11, 13, 20, 46, 66, 70, 167, 178, 185, 199, 224, 261, 309, 313, 318, 349, 412, 423, 440,
                   454, 477, 478, 523, 540, 581, 588, 595, 654, 688, 691, 774, 798, 875, 898, 926, 970, 987,
                   1027, 1109, 1169, 1182, 1239, 1256, 1298, 1324, 1353, 1359, 1405, 1442, 1447]
    train.drop(train.index[outlier_idx],inplace=True)
    all_data = pd.concat((train.loc[:,'MSSubClass':'SaleCondition'],
                          test.loc[:,'MSSubClass':'SaleCondition']))

    to_delete = ['Alley','FireplaceQu','PoolQC','Fence','MiscFeature']
    all_data = all_data.drop(to_delete,axis=1)

    train["SalePrice"] = np.log1p(train["SalePrice"])
    #log transform skewed numeric features
    numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
    skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
    skewed_feats = skewed_feats[skewed_feats > 0.75]
    skewed_feats = skewed_feats.index
    all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
    all_data = pd.get_dummies(all_data)
    all_data = all_data.fillna(all_data.mean())
    X_train = all_data[:train.shape[0]]
    X_test = all_data[train.shape[0]:]
    y = train.SalePrice

    return X_train,X_test,y
RandomForest.py (project: HousePricePredictionKaggle, author: Nuwantha)
(Defines the same data_preprocess function as gradient_boosting.py above, verbatim.)
ensemble_stacking.py (project: HousePricePredictionKaggle, author: Nuwantha)
def data_preprocess(train, test):
    outlier_idx = [4, 11, 13, 20, 46, 66, 70, 167, 178, 185, 199, 224, 261, 309, 313, 318, 349, 412, 423, 440, 454, 477,
                   478, 523, 540, 581, 588, 595, 654, 688, 691, 774, 798, 875, 898, 926, 970, 987, 1027, 1109, 1169,
                   1182, 1239, 1256, 1298, 1324, 1353, 1359, 1405, 1442, 1447]
    train.drop(train.index[outlier_idx], inplace=True)
    all_data = pd.concat((train.loc[:, 'MSSubClass':'SaleCondition'],
                          test.loc[:, 'MSSubClass':'SaleCondition']))

    to_delete = ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
    all_data = all_data.drop(to_delete, axis=1)

    train["SalePrice"] = np.log1p(train["SalePrice"])
    # log transform skewed numeric features
    numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
    skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna()))  # compute skewness
    skewed_feats = skewed_feats[skewed_feats > 0.75]
    skewed_feats = skewed_feats.index
    all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
    all_data = pd.get_dummies(all_data)
    all_data = all_data.fillna(method='ffill')
    X_train = all_data[:train.shape[0]]
    X_test = all_data[train.shape[0]:]
    y = train.SalePrice

    return X_train, X_test, y
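
These data_preprocess variants share one pattern: concatenate train and test before get_dummies so that both come back with exactly the same dummy columns, then split again by row count. A minimal sketch of just that pattern (hypothetical inputs, assuming train's row count is final at this point):

def encode_together(train, test):
    # One-hot encode train and test together so both see every category
    all_data = pd.get_dummies(pd.concat((train, test)))
    X_train = all_data[:train.shape[0]]  # first len(train) rows
    X_test = all_data[train.shape[0]:]   # the remainder
    return X_train, X_test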
run_mpi.py (project: pylspm, author: lseman)
def do_work_pso(item, nclusters, data, LVcsv, Mcsv, scheme, reg, h, maximo, population):
    output = pd.DataFrame(population[item].position)
    output.columns = ['Split']
    dataSplit = pd.concat([data, output], axis=1)
    f1 = []
    results = []
    for i in range(nclusters):
        dataSplited = (dataSplit.loc[dataSplit['Split']
                                     == i]).drop('Split', axis=1)
        dataSplited.index = range(len(dataSplited))

        try:
            results.append(PyLSpm(dataSplited, LVcsv, Mcsv, scheme,
                                  reg, 0, 50, HOC='true'))

            resid = results[i].residuals()[3]
            f1.append(resid)
        except Exception:
            f1.append(10000)  # penalize any failed PLS fit with a large residual
#    print((1 / np.sum(f1)))
    return (1 / np.sum(f1))
run_mpi.py (project: pylspm, author: lseman)
def do_work_ga(item, nclusters, data, LVcsv, Mcsv, scheme, reg, h, maximo, population):
    output = pd.DataFrame(population[item].genes)
    output.columns = ['Split']
    dataSplit = pd.concat([data, output], axis=1)
    f1 = []
    results = []
    for i in range(nclusters):
        dataSplited = (dataSplit.loc[dataSplit['Split']
                                     == i]).drop('Split', axis=1)
        dataSplited.index = range(len(dataSplited))

        try:
            results.append(PyLSpm(dataSplited, LVcsv, Mcsv, scheme,
                                  reg, 0, 50, HOC='true'))

            resid = results[i].residuals()[3]
            f1.append(resid)
        except Exception:
            f1.append(10000)
    return (1 / np.sum(f1))

# Main
boot_mpi.py (project: pylspm, author: lseman)
def do_work_pso(data, LVcsv, Mcsv, scheme, reg, h, maximo):
    # NOTE: `population`, `item`, and `nclusters` are not parameters of this
    # variant; as written it relies on them existing as module-level globals.
    output = pd.DataFrame(population[item].position)
    output.columns = ['Split']
    dataSplit = pd.concat([data, output], axis=1)
    f1 = []
    results = []
    for i in range(nclusters):
        dataSplited = (dataSplit.loc[dataSplit['Split']
                                     == i]).drop('Split', axis=1)
        dataSplited.index = range(len(dataSplited))

        try:
            results.append(PyLSpm(dataSplited, LVcsv, Mcsv, scheme,
                                  reg, 0, 50, HOC='true'))

            resid = results[i].residuals()[3]
            f1.append(resid)
        except Exception:
            f1.append(10000)
    print((1 / np.sum(f1)))
    return (1 / np.sum(f1))
boot.py (project: pylspm, author: lseman)
def do_work_ga(self, item):
        output = pd.DataFrame(self.population[item].genes)
        output.columns = ['Split']
        dataSplit = pd.concat([self.data, output], axis=1)
        f1 = []
        results = []
        for i in range(self.nclusters):
            dataSplited = (dataSplit.loc[dataSplit['Split']
                                         == i]).drop('Split', axis=1)
            dataSplited.index = range(len(dataSplited))

            try:
                results.append(PyLSpm(dataSplited, self.LVcsv, self.Mcsv, self.scheme,
                                      self.reg, 0, 50, HOC='true'))

                resid = results[i].residuals()[3]
                f1.append(resid)
            except Exception:
                f1.append(10000)
        print((1 / np.sum(f1)))
        return (1 / np.sum(f1))
boot.py (project: pylspm, author: lseman)
def do_work_pso(self, item):
        output = pd.DataFrame(self.population[item].position)
        output.columns = ['Split']
        dataSplit = pd.concat([self.data, output], axis=1)
        f1 = []
        results = []
        for i in range(self.nclusters):
            dataSplited = (dataSplit.loc[dataSplit['Split']
                                         == i]).drop('Split', axis=1)
            dataSplited.index = range(len(dataSplited))

            try:
                results.append(PyLSpm(dataSplited, self.LVcsv, self.Mcsv, self.scheme,
                                      self.reg, 0, 50, HOC='true'))

                resid = results[i].residuals()[3]
                f1.append(resid)
            except Exception:
                f1.append(10000)
        print((1 / np.sum(f1)))
        return (1 / np.sum(f1))
discover.py (project: IgDiscover, author: NBISweden)
def merged(self, s, t):
        chars = []
        for c1, c2 in zip_longest(s.sequence, t.sequence):
            if c1 is None:
                c = c2
            elif c2 is None:
                c = c1
            elif c1 == 'N':
                c = c2
            elif c2 == 'N':
                c = c1
            elif c1 != c2:
                return None
            else:
                assert c1 == c2
                c = c1
            chars.append(c)
        seq = ''.join(chars)
        requested = s.requested or t.requested
        name = s.name + ';' + t.name
        # take union of groups
        group = pd.concat([s.group, t.group]).groupby(level=0).last()
        return SiblingInfo(seq, requested, name, group)
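
The "take union of groups" line works because pd.concat stacks the two group tables on top of each other and groupby(level=0).last() then keeps one row per index label, with t.group winning wherever both define a label. A minimal illustration with Series:

s_group = pd.Series({'a': 1, 'b': 2})
t_group = pd.Series({'b': 20, 'c': 30})
pd.concat([s_group, t_group]).groupby(level=0).last()
# a     1
# b    20
# c    30   -- every label from either input, t's value winning ties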
tracker.py (project: zipline-chinese, author: zhanghan1990)
def update_dividends(self, new_dividends):
        """
        Update our dividend frame with new dividends.  @new_dividends should be
        a DataFrame with columns containing at least the entries in
        zipline.protocol.DIVIDEND_FIELDS.
        """

        # Mark each new dividend with a unique integer id.  This ensures that
        # we can differentiate dividends whose date/sid fields are otherwise
        # identical.
        new_dividends['id'] = np.arange(
            self._dividend_count,
            self._dividend_count + len(new_dividends),
        )
        self._dividend_count += len(new_dividends)

        self.dividend_frame = sort_values(pd.concat(
            [self.dividend_frame, new_dividends]
        ), ['pay_date', 'ex_date']).set_index('id', drop=False)
test_buyback_auth.py (project: zipline-chinese, author: zhanghan1990)
def pipeline_event_loader_args(self, dates):
        _, mapping = super(
            BlazeCashBuybackAuthLoaderTestCase,
            self,
        ).pipeline_event_loader_args(dates)
        return (bz.data(pd.concat(
            pd.DataFrame({
                BUYBACK_ANNOUNCEMENT_FIELD_NAME:
                    frame[BUYBACK_ANNOUNCEMENT_FIELD_NAME],
                CASH_FIELD_NAME:
                    frame[CASH_FIELD_NAME],
                TS_FIELD_NAME:
                    frame[TS_FIELD_NAME],
                SID_FIELD_NAME: sid,
            })
            for sid, frame in iteritems(mapping)
        ).reset_index(drop=True)),)
test_buyback_auth.py (project: zipline-chinese, author: zhanghan1990)
def pipeline_event_loader_args(self, dates):
        _, mapping = super(
            BlazeShareBuybackAuthLoaderTestCase,
            self,
        ).pipeline_event_loader_args(dates)
        return (bz.data(pd.concat(
            pd.DataFrame({
                BUYBACK_ANNOUNCEMENT_FIELD_NAME:
                    frame[BUYBACK_ANNOUNCEMENT_FIELD_NAME],
                SHARE_COUNT_FIELD_NAME:
                    frame[SHARE_COUNT_FIELD_NAME],
                TS_FIELD_NAME:
                    frame[TS_FIELD_NAME],
                SID_FIELD_NAME: sid,
            })
            for sid, frame in iteritems(mapping)
        ).reset_index(drop=True)),)
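
Note that both loaders above pass a generator expression directly to pd.concat; it accepts any iterable of frames, not just a list. For example (toy data):

frames = {1: pd.DataFrame({'v': [1.0]}), 2: pd.DataFrame({'v': [2.0]})}
combined = pd.concat(
    df.assign(sid=sid) for sid, df in frames.items()
).reset_index(drop=True)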
xlsx_usage.py (project: table-compositor, author: InvestmentSystems)
def load_names_data():
    fp = os.path.join(tempfile.gettempdir(), ZIP_NAME)
    if not os.path.exists(fp):
        r = requests.get(URL_NAMES)
        with open(fp, 'wb') as f:
            f.write(r.content)

    post = collections.OrderedDict()
    with zipfile.ZipFile(fp) as zf:
        # get ZipInfo instances
        for zi in sorted(zf.infolist(), key=lambda zi: zi.filename):
            fn = zi.filename
            if fn.startswith('yob'):
                year = int(fn[3:7])
                df = pd.read_csv(
                    zf.open(zi),
                    header=None,
                    names=('name', 'gender', 'count'))
                df['year'] = year
                post[year] = df

        df = pd.concat(post.values())
        df.set_index('name', inplace=True, drop=True)
        return df
html_usage.py (project: table-compositor, author: InvestmentSystems)
(Defines the same load_names_data function as xlsx_usage.py above, verbatim.)
soccerstan.py (project: soccerstan, author: Torvaney)
def read_data(fname):
    """ Read football-data.co.uk csv """
    data = (
        pd.read_csv(fname)
        .rename(columns={
                'HomeTeam': 'home_team',
                'AwayTeam': 'away_team',
                'FTHG': 'home_goals',
                'FTAG': 'away_goals'
            })
        .loc[lambda df: ~pd.isnull(df['home_goals'])]  # Remove future games
    )

    team_map = stan_map(pd.concat([data['home_team'], data['away_team']]))
    data['home_team_id'] = data['home_team'].replace(team_map)
    data['away_team_id'] = data['away_team'].replace(team_map)


    for col in ('home_goals', 'away_goals'):
        data[col] = [int(c) for c in data[col]]

    return data, team_map
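
stan_map is not shown in this excerpt. From how team_map is used (a replace mapping that turns team names into integer ids for a Stan model, and Stan indexes from 1), a plausible implementation would be (hypothetical sketch, not the project's actual code):

def stan_map(series):
    # Map each unique value to a 1-based integer id (Stan indexes from 1)
    return {team: i + 1 for i, team in enumerate(series.unique())}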
QATdx.py (project: QUANTAXIS, author: yutiansut)
def QA_fetch_get_security_bars(code, _type, lens, ip=best_ip['stock'], port=7709):
    api = TdxHq_API()
    with api.connect(ip, port):
        data = pd.concat([api.to_df(api.get_security_bars(_select_type(_type), _select_market_code(
            code), code, (i - 1) * 800, 800)) for i in range(1, int(lens / 800) + 2)], axis=0)
        data = data\
            .assign(datetime=pd.to_datetime(data['datetime']), code=str(code))\
            .drop(['year', 'month', 'day', 'hour', 'minute'], axis=1, inplace=False)\
            .assign(date=data['datetime'].apply(lambda x: str(x)[0:10]))\
            .assign(date_stamp=data['datetime'].apply(lambda x: QA_util_date_stamp(x)))\
            .assign(time_stamp=data['datetime'].apply(lambda x: QA_util_time_stamp(x)))\
            .assign(type=_type).set_index('datetime', drop=False, inplace=False).tail(lens)
        if data is not None:
            return data
        else:
            return None
QATdx.py (project: QUANTAXIS, author: yutiansut)
def QA_fetch_get_stock_block(ip=best_ip['stock'], port=7709):
    'Fetch sector/concept block data'
    api = TdxHq_API()
    with api.connect(ip, port):

        data = pd.concat([api.to_df(api.get_and_parse_block_info("block_gn.dat")).assign(type='gn'),
                          api.to_df(api.get_and_parse_block_info(
                              "block.dat")).assign(type='yb'),
                          api.to_df(api.get_and_parse_block_info(
                              "block_zs.dat")).assign(type='zs'),
                          api.to_df(api.get_and_parse_block_info("block_fg.dat")).assign(type='fg')])

        if len(data) > 10:
            return data.assign(source='tdx').drop(['block_type', 'code_index'], axis=1).set_index('code', drop=False, inplace=False).drop_duplicates()
        else:
            QA_util_log_info('Wrong with fetch block ')
QATdx.py (project: QUANTAXIS, author: yutiansut)
def QA_fetch_get_future_day(code, start_date, end_date, level='day', ip=best_ip['future'], port=7727):
    'Fetch futures data, daily bars'

    apix = TdxExHq_API()
    start_date = str(start_date)[0:10]
    today_ = datetime.date.today()
    lens = QA_util_get_trade_gap(start_date, today_)
    global extension_market_info
    extension_market_info = QA_fetch_get_future_list() if extension_market_info is None else extension_market_info

    with apix.connect(ip, port):
        code_market = extension_market_info.query('code=="{}"'.format(code))

        data = pd.concat([apix.to_df(apix.get_instrument_bars(
            _select_type(level), int(code_market.market), str(code),
            (int(lens / 700) - i) * 700, 700)) for i in range(int(lens / 700) + 1)], axis=0)
        data = data.assign(date=data['datetime'].apply(lambda x: str(x[0:10]))).assign(code=str(code))\
            .assign(date_stamp=data['datetime'].apply(lambda x: QA_util_date_stamp(str(x)[0:10]))).set_index('date', drop=False, inplace=False)

        return data.drop(['year', 'month', 'day', 'hour', 'minute', 'datetime'], axis=1)[start_date:end_date].assign(date=data['date'].apply(lambda x: str(x)[0:10]))
data_fq.py (project: QUANTAXIS, author: yutiansut)
def QA_data_make_qfq(bfq_data, xdxr_data):
    'Compute forward-adjusted (qfq) prices from raw prices and xdxr (ex-dividend/ex-rights) data'
    info = xdxr_data[xdxr_data['category'] == 1]
    bfq_data['if_trade'] = 1
    data = pd.concat([bfq_data, info[['category']]
                      [bfq_data.index[0]:bfq_data.index[-1]]], axis=1)
    data['if_trade'].fillna(value=0, inplace=True)
    data = data.fillna(method='ffill')
    data = pd.concat([data, info[['fenhong', 'peigu', 'peigujia',
                                  'songzhuangu']][bfq_data.index[0]:bfq_data.index[-1]]], axis=1)
    data = data.fillna(0)
    data['preclose'] = (data['close'].shift(1) * 10 - data['fenhong'] + data['peigu']
                        * data['peigujia']) / (10 + data['peigu'] + data['songzhuangu'])
    data['adj'] = (data['preclose'].shift(-1) /
                   data['close']).fillna(1)[::-1].cumprod()
    data['open'] = data['open'] * data['adj']
    data['high'] = data['high'] * data['adj']
    data['low'] = data['low'] * data['adj']
    data['close'] = data['close'] * data['adj']
    data['preclose'] = data['preclose'] * data['adj']

    return data.query('if_trade==1').drop(['fenhong', 'peigu', 'peigujia', 'songzhuangu',
                                           'if_trade', 'category'], axis=1).query("open != 0")
data_fq.py (project: QUANTAXIS, author: yutiansut)
def QA_data_make_hfq(bfq_data, xdxr_data):
    'Compute backward-adjusted (hfq) prices from raw prices and xdxr (ex-dividend/ex-rights) data'
    info = xdxr_data[xdxr_data['category'] == 1]
    bfq_data['if_trade'] = 1
    data = pd.concat([bfq_data, info[['category']]
                      [bfq_data.index[0]:bfq_data.index[-1]]], axis=1)

    data['if_trade'].fillna(value=0, inplace=True)
    data = data.fillna(method='ffill')

    data = pd.concat([data, info[['fenhong', 'peigu', 'peigujia',
                                  'songzhuangu']][bfq_data.index[0]:bfq_data.index[-1]]], axis=1)

    data = data.fillna(0)
    data['preclose'] = (data['close'].shift(1) * 10 - data['fenhong'] + data['peigu']
                        * data['peigujia']) / (10 + data['peigu'] + data['songzhuangu'])
    data['adj'] = (data['preclose'].shift(-1) /
                   data['close']).fillna(1).cumprod()
    data['open'] = data['open'] / data['adj']
    data['high'] = data['high'] / data['adj']
    data['low'] = data['low'] / data['adj']
    data['close'] = data['close'] / data['adj']
    data['preclose'] = data['preclose'] / data['adj']
    return data.query('if_trade==1').drop(['fenhong', 'peigu', 'peigujia', 'songzhuangu'], axis=1).query("open != 0")
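
A note on the math shared by QA_data_make_qfq and QA_data_make_hfq: the xdxr columns are per-10-share corporate-action quantities (hence the 10s in the formula). fenhong is the cash dividend, peigu the number of rights-issue shares, peigujia the rights-issue price, and songzhuangu the bonus/converted shares. The theoretical ex-right previous close is

    preclose = (close.shift(1) * 10 - fenhong + peigu * peigujia) / (10 + peigu + songzhuangu)

and the per-bar ratio preclose.shift(-1) / close, accumulated with cumprod, becomes the adjustment factor 'adj'. qfq accumulates it backwards from the latest bar and multiplies prices by it, so the most recent price stays unchanged (forward adjustment); hfq accumulates it forwards and divides, so the earliest price stays unchanged (backward adjustment).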
fe.py (project: kaggle-review, author: daxiongshu)
def get_text_len(DB, tr, te):
    if tr is None:
        if te=='stage1':
            Data = [DB.data['training_text'],DB.data['test_text_filter']]
        else:
            Data = [pd.concat([DB.data['training_text'],DB.data['test_text_filter']],axis=0),DB.data['stage2_test_text']]
    else:
        Data = [DB.data['training_text']]
    for data in Data:
        data['tl'] = data['Text'].apply(lambda x:len(x))
        data['tl2'] = data['Text'].apply(lambda x:len(x.split()))
    if tr is None:
        X,Xt = Data
        return X[['tl','tl2']].values, Xt[['tl','tl2']].values
    else:
        X = Data[0][['tl','tl2']].values
        return X[tr],X[te]
fe.py (project: kaggle-review, author: daxiongshu)
def get_pattern(DB,tr,te,patterns):
    cols = ['p%d'%c for c,p in enumerate(patterns)]
    if tr is None:
        test = DB.data['test_variants_filter'] if te=='stage1' else DB.data['stage2_test_variants']
        if te=='stage1':
            train = DB.data['training_variants']
        else:
            train = pd.concat([DB.data['training_variants'],DB.data["test_variants_filter"]],axis=0)
        Data =[train,test]
    else:
        Data = [DB.data['training_variants']]

    for data in Data:
        for c,p in enumerate(patterns):
            data['p%d'%c] = data['Variation'].apply(lambda x: len(re.findall(p,str(x).lower())))

    if tr is None:
        return train[cols].values,test[cols].values
    else:
        X = data[cols].values
        return X[tr],X[te]
fe.py (project: kaggle-review, author: daxiongshu)
def onehot_gene(DB, tr, te):
    from utils.np_utils.encoder import onehot_encode
    if tr is None:
        train = DB.data['training_variants']
        if te=="stage1":
            test = DB.data['test_variants_filter']
        else:
            train = pd.concat([train,DB.data['test_variants_filter']],axis=0)
            test = DB.data['stage2_test_variants']
        lbl_encode(train,test)
        n = max(train['Gene'].max(),test['Gene'].max())
        gtr = onehot_encode(train['Gene'].values,n=n+1)
        gte = onehot_encode(test['Gene'].values, n=n + 1)  # pass n so train/test widths match
        return gtr,gte
    else:
        data = DB.data['training_variants']
        lbl_encode(data,cols=['Gene'])
        gene = data['Gene'].values
        gene = onehot_encode(gene)
        return gene[tr],gene[te]
xgb.py (project: kaggle-review, author: daxiongshu)
def post_cv(flags):
    import re
    import os
    path = flags.data_path
    files = [i for i in os.listdir(path) if len(re.findall('cv_[0-9].csv',i))]
    s = []
    for name in files:
        s.append(pd.read_csv("%s/%s"%(path,name)))

    s = pd.concat(s,axis=0)
    print(s.head())
    classes = len([i for i in s.columns.values if 'class' in i])
    from utils.np_utils.utils import cross_entropy
    yp = s[['class%d'%i for i in range(1,classes+1)]].values
    y = s['real'].values
    print(cross_entropy(y,yp))
    s.to_csv("%s/cv.csv"%path,index=False)
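
cross_entropy comes from the project's utils module and is not shown here; given that yp holds the class1..classN probability columns and y the integer labels, it is presumably the standard multi-class log loss. A minimal sketch under that assumption (hypothetical, not the project's actual code):

def cross_entropy(y, yp, eps=1e-15):
    # y: integer labels in 1..K; yp: (n_samples, K) predicted probabilities
    yp = np.clip(yp, eps, 1 - eps)
    true_class_prob = yp[np.arange(len(y)), y.astype(int) - 1]
    return -np.mean(np.log(true_class_prob))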
gos.py (project: GOS, author: crcresearch)
def create_agents(self, generator):
        """
        Given information on a set of countries and a generator function,
        generate the agents and assign the results to ``self.agents``.

        :type generator: DataFrame, str, int
        :param generator: A function which generates the agents.
        """
        self.generator = generator
        country_array = pd.concat([pd.Series([c] * k["Population"]) for c, k in self.df.iterrows()])
        country_array.index = range(len(country_array))
        # Garbage collect before creating new processes.
        gc.collect()
        self.agents = pd.concat(
            self.pool.imap(self._gen_agents,
                           np.array_split(country_array, self.processes * self.splits))
        )
        self.agents.index = range(len(self.agents))
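
The splitting half of this round trip uses np.array_split, whose chunks pd.concat later reassembles; the two are exact inverses:

s = pd.Series(range(10))
chunks = np.array_split(s, 3)       # 3 Series of sizes 4, 3, 3
assert pd.concat(chunks).equals(s)  # concat restores the original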
gos.py (project: GOS, author: crcresearch)
(Defines the same create_agents method as the previous gos.py excerpt, verbatim.)
GBREncoding.py (project: kaggle, author: RankingAI)
def OHETr(self, tr):
        """"""
        OHEDict = {}
        for col in tr.columns:
            ValueCounts = [str(int(v)) for v in tr[col].value_counts().index.values]
            ValueCounts.append('missing')
            SelectedValues = dict((k, v) for (v, k) in enumerate(ValueCounts, start=0))
            OHTr = self.__ApplyOH(tr[col].values, SelectedValues)

            headers = dict((('%s_%s' % (col, k)), SelectedValues[k]) for k in SelectedValues)
            tmp = [v[0] for v in sorted(headers.items(), key=lambda x: x[1])]
            OHDFTr = pd.DataFrame(OHTr, index=tr.index, columns=tmp)

            tr = pd.concat([tr, OHDFTr], axis=1)

            tr.drop(col, axis=1, inplace=True)
            OHEDict[col] = SelectedValues
            #print('Column %s was encoded.' % col)

        return tr, OHEDict
helperFuncs.py (project: PersonalizedMultitaskLearning, author: mitmedialab)
def combineFilesIntoDf(file_path, filenames, reset_index=False, drop_cols=None):
    df = None
    for filename in filenames:
        fdf = pd.read_csv(file_path + filename, index_col=0, parse_dates=True)  # pd.DataFrame.from_csv was removed from modern pandas

        if reset_index:
            fdf = fdf.reset_index()

        if df is None:
            df = fdf.copy(deep=True)
        else:
            df = pd.concat([df,fdf])

    if drop_cols is not None:
        for feat in drop_cols:
            df = df.drop(feat, axis=1)  # positional axis is no longer accepted by newer pandas

    return df
recipe_clustering.py (project: Flavor-Network, author: lingcheng99)
def tsne_cluster_cuisine(df,sublist):
    lenlist=[0]
    df_sub = df[df['cuisine']==sublist[0]]
    lenlist.append(df_sub.shape[0])
    for cuisine in sublist[1:]:
        temp = df[df['cuisine']==cuisine]
        df_sub = pd.concat([df_sub, temp],axis=0,ignore_index=True)
        lenlist.append(df_sub.shape[0])
    df_X = df_sub.drop(['cuisine','recipeName'],axis=1)
    print(df_X.shape, lenlist)

    dist = squareform(pdist(df_X, metric='cosine'))
    tsne = TSNE(metric='precomputed').fit_transform(dist)

    palette = sns.color_palette("hls", len(sublist))
    plt.figure(figsize=(10,10))
    for i,cuisine in enumerate(sublist):
        plt.scatter(tsne[lenlist[i]:lenlist[i+1],0],\
        tsne[lenlist[i]:lenlist[i+1],1],c=palette[i],label=sublist[i])
    plt.legend()

# interactive plot with bokeh; set up for four categories, with color palette; pass in df for either ingredient or flavor
dataset.py (project: sanergy-public, author: dssg)
def create_future(fold, features_old, cfg_parameters):
    """
    Just for testing purposes.
    Sets up a replicate of the last day(s) data to create new data for testing. But in reality,
    we should be able to create features for the upcoming days from past data, so this would not be needed???
    """
    last_day = fold['window_end']
    next_days = [last_day + timedelta(days=i) for i in range(1, cfg_parameters['prediction_horizon'] + 1)]
    old_features_unique = features_old.drop_duplicates(subset='ToiletID')
    l_future_features = []
    for day in next_days:
        next_day_features = old_features_unique.copy()
        next_day_features["Collection_Date"] = day
        l_future_features.append(next_day_features)
    future_features = pd.concat(l_future_features, ignore_index=True)
    return(future_features)

