def data_preprocess(train, test):
    outlier_idx = [4, 11, 13, 20, 46, 66, 70, 167, 178, 185, 199, 224, 261, 309, 313, 318,
                   349, 412, 423, 440, 454, 477, 478, 523, 540, 581, 588, 595, 654, 688,
                   691, 774, 798, 875, 898, 926, 970, 987, 1027, 1109, 1169, 1182, 1239,
                   1256, 1298, 1324, 1353, 1359, 1405, 1442, 1447]
    train.drop(train.index[outlier_idx], inplace=True)
    all_data = pd.concat((train.loc[:, 'MSSubClass':'SaleCondition'],
                          test.loc[:, 'MSSubClass':'SaleCondition']))
    to_delete = ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
    all_data = all_data.drop(to_delete, axis=1)
    train["SalePrice"] = np.log1p(train["SalePrice"])
    # log transform skewed numeric features
    numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
    skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna()))  # compute skewness
    skewed_feats = skewed_feats[skewed_feats > 0.75]
    skewed_feats = skewed_feats.index
    all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
    all_data = pd.get_dummies(all_data)
    all_data = all_data.fillna(all_data.mean())
    X_train = all_data[:train.shape[0]]
    X_test = all_data[train.shape[0]:]
    y = train.SalePrice
    return X_train, X_test, y
def data_preprocess(train, test):
    outlier_idx = [4, 11, 13, 20, 46, 66, 70, 167, 178, 185, 199, 224, 261, 309, 313, 318,
                   349, 412, 423, 440, 454, 477, 478, 523, 540, 581, 588, 595, 654, 688,
                   691, 774, 798, 875, 898, 926, 970, 987, 1027, 1109, 1169, 1182, 1239,
                   1256, 1298, 1324, 1353, 1359, 1405, 1442, 1447]
    train.drop(train.index[outlier_idx], inplace=True)
    all_data = pd.concat((train.loc[:, 'MSSubClass':'SaleCondition'],
                          test.loc[:, 'MSSubClass':'SaleCondition']))
    to_delete = ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
    all_data = all_data.drop(to_delete, axis=1)
    train["SalePrice"] = np.log1p(train["SalePrice"])
    # log transform skewed numeric features
    numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
    skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna()))  # compute skewness
    skewed_feats = skewed_feats[skewed_feats > 0.75]
    skewed_feats = skewed_feats.index
    all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
    all_data = pd.get_dummies(all_data)
    all_data = all_data.fillna(method='ffill')
    X_train = all_data[:train.shape[0]]
    X_test = all_data[train.shape[0]:]
    y = train.SalePrice
    return X_train, X_test, y
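# A minimal usage sketch for the function above, assuming the Kaggle House
# Prices CSVs are available locally as 'train.csv' and 'test.csv' (the file
# names are an assumption) and that numpy, pandas and scipy.stats.skew are
# imported at module level as np, pd and skew:
#
# train = pd.read_csv('train.csv')
# test = pd.read_csv('test.csv')
# X_train, X_test, y = data_preprocess(train, test)
# print(X_train.shape, X_test.shape, y.shape)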
def do_work_pso(item, nclusters, data, LVcsv, Mcsv, scheme, reg, h, maximo, population):
    output = pd.DataFrame(population[item].position)
    output.columns = ['Split']
    dataSplit = pd.concat([data, output], axis=1)
    f1 = []
    results = []
    for i in range(nclusters):
        dataSplited = (dataSplit.loc[dataSplit['Split']
                                     == i]).drop('Split', axis=1)
        dataSplited.index = range(len(dataSplited))
        try:
            results.append(PyLSpm(dataSplited, LVcsv, Mcsv, scheme,
                                  reg, 0, 50, HOC='true'))
            resid = results[i].residuals()[3]
            f1.append(resid)
        except:
            f1.append(10000)
    # print((1 / np.sum(f1)))
    return (1 / np.sum(f1))
def do_work_ga(item, nclusters, data, LVcsv, Mcsv, scheme, reg, h, maximo, population):
    output = pd.DataFrame(population[item].genes)
    output.columns = ['Split']
    dataSplit = pd.concat([data, output], axis=1)
    f1 = []
    results = []
    for i in range(nclusters):
        dataSplited = (dataSplit.loc[dataSplit['Split']
                                     == i]).drop('Split', axis=1)
        dataSplited.index = range(len(dataSplited))
        try:
            results.append(PyLSpm(dataSplited, LVcsv, Mcsv, scheme,
                                  reg, 0, 50, HOC='true'))
            resid = results[i].residuals()[3]
            f1.append(resid)
        except:
            f1.append(10000)
    return (1 / np.sum(f1))
# Main
def do_work_pso(data, LVcsv, Mcsv, scheme, reg, h, maximo):
    # NOTE: `item`, `population` and `nclusters` are not parameters of this
    # variant; as written it expects them to be available at module scope.
    output = pd.DataFrame(population[item].position)
    output.columns = ['Split']
    dataSplit = pd.concat([data, output], axis=1)
    f1 = []
    results = []
    for i in range(nclusters):
        dataSplited = (dataSplit.loc[dataSplit['Split']
                                     == i]).drop('Split', axis=1)
        dataSplited.index = range(len(dataSplited))
        try:
            results.append(PyLSpm(dataSplited, LVcsv, Mcsv, scheme,
                                  reg, 0, 50, HOC='true'))
            resid = results[i].residuals()[3]
            f1.append(resid)
        except:
            f1.append(10000)
    print((1 / np.sum(f1)))
    return (1 / np.sum(f1))
def do_work_ga(self, item):
    output = pd.DataFrame(self.population[item].genes)
    output.columns = ['Split']
    dataSplit = pd.concat([self.data, output], axis=1)
    f1 = []
    results = []
    for i in range(self.nclusters):
        dataSplited = (dataSplit.loc[dataSplit['Split']
                                     == i]).drop('Split', axis=1)
        dataSplited.index = range(len(dataSplited))
        try:
            results.append(PyLSpm(dataSplited, self.LVcsv, self.Mcsv, self.scheme,
                                  self.reg, 0, 50, HOC='true'))
            resid = results[i].residuals()[3]
            f1.append(resid)
        except:
            f1.append(10000)
    print((1 / np.sum(f1)))
    return (1 / np.sum(f1))
def do_work_pso(self, item):
    output = pd.DataFrame(self.population[item].position)
    output.columns = ['Split']
    dataSplit = pd.concat([self.data, output], axis=1)
    f1 = []
    results = []
    for i in range(self.nclusters):
        dataSplited = (dataSplit.loc[dataSplit['Split']
                                     == i]).drop('Split', axis=1)
        dataSplited.index = range(len(dataSplited))
        try:
            results.append(PyLSpm(dataSplited, self.LVcsv, self.Mcsv, self.scheme,
                                  self.reg, 0, 50, HOC='true'))
            resid = results[i].residuals()[3]
            f1.append(resid)
        except:
            f1.append(10000)
    print((1 / np.sum(f1)))
    return (1 / np.sum(f1))
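# The core pd.concat usage shared by the do_work_* variants above, shown in
# isolation: a cluster-assignment column is concatenated onto the data
# (axis=1) and then used to slice out per-cluster frames. Toy data and
# made-up assignments, a minimal sketch only:
import pandas as pd

toy_data = pd.DataFrame({'x': [1.0, 2.0, 3.0, 4.0]})
toy_split = pd.DataFrame({'Split': [0, 1, 0, 1]})
toy_dataSplit = pd.concat([toy_data, toy_split], axis=1)
toy_cluster0 = toy_dataSplit.loc[toy_dataSplit['Split'] == 0].drop('Split', axis=1)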
def merged(self, s, t):
    chars = []
    for c1, c2 in zip_longest(s.sequence, t.sequence):
        if c1 is None:
            c = c2
        elif c2 is None:
            c = c1
        elif c1 == 'N':
            c = c2
        elif c2 == 'N':
            c = c1
        elif c1 != c2:
            return None
        else:
            assert c1 == c2
            c = c1
        chars.append(c)
    seq = ''.join(chars)
    requested = s.requested or t.requested
    name = s.name + ';' + t.name
    # take union of groups
    group = pd.concat([s.group, t.group]).groupby(level=0).last()
    return SiblingInfo(seq, requested, name, group)
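# The "take union of groups" trick above, shown on toy Series: concatenating
# two index-labelled Series and keeping the last value per index label yields
# the union of their indices, with values from the second Series winning on
# overlap. Labels and values below are made up for illustration.
import pandas as pd

s_group = pd.Series({'g1': 1, 'g2': 2})
t_group = pd.Series({'g2': 5, 'g3': 3})
union = pd.concat([s_group, t_group]).groupby(level=0).last()
# union now has index ['g1', 'g2', 'g3'], with 'g2' taken from t_group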
def update_dividends(self, new_dividends):
    """
    Update our dividend frame with new dividends. @new_dividends should be
    a DataFrame with columns containing at least the entries in
    zipline.protocol.DIVIDEND_FIELDS.
    """
    # Mark each new dividend with a unique integer id. This ensures that
    # we can differentiate dividends whose date/sid fields are otherwise
    # identical.
    new_dividends['id'] = np.arange(
        self._dividend_count,
        self._dividend_count + len(new_dividends),
    )
    self._dividend_count += len(new_dividends)
    self.dividend_frame = sort_values(pd.concat(
        [self.dividend_frame, new_dividends]
    ), ['pay_date', 'ex_date']).set_index('id', drop=False)
def pipeline_event_loader_args(self, dates):
    _, mapping = super(
        BlazeCashBuybackAuthLoaderTestCase,
        self,
    ).pipeline_event_loader_args(dates)
    return (bz.data(pd.concat(
        pd.DataFrame({
            BUYBACK_ANNOUNCEMENT_FIELD_NAME:
                frame[BUYBACK_ANNOUNCEMENT_FIELD_NAME],
            CASH_FIELD_NAME:
                frame[CASH_FIELD_NAME],
            TS_FIELD_NAME:
                frame[TS_FIELD_NAME],
            SID_FIELD_NAME: sid,
        })
        for sid, frame in iteritems(mapping)
    ).reset_index(drop=True)),)
def pipeline_event_loader_args(self, dates):
    _, mapping = super(
        BlazeShareBuybackAuthLoaderTestCase,
        self,
    ).pipeline_event_loader_args(dates)
    return (bz.data(pd.concat(
        pd.DataFrame({
            BUYBACK_ANNOUNCEMENT_FIELD_NAME:
                frame[BUYBACK_ANNOUNCEMENT_FIELD_NAME],
            SHARE_COUNT_FIELD_NAME:
                frame[SHARE_COUNT_FIELD_NAME],
            TS_FIELD_NAME:
                frame[TS_FIELD_NAME],
            SID_FIELD_NAME: sid,
        })
        for sid, frame in iteritems(mapping)
    ).reset_index(drop=True)),)
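# Generic sketch of the pattern used by both loaders above: pd.concat accepts
# a generator of DataFrames, so a {sid: frame} mapping can be flattened into
# one long frame with the key carried along as a column. Toy data and made-up
# column names:
import pandas as pd

toy_mapping = {1: pd.DataFrame({'value': [10, 11]}),
               2: pd.DataFrame({'value': [20]})}
long_frame = pd.concat(
    frame.assign(sid=sid) for sid, frame in toy_mapping.items()
).reset_index(drop=True)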
def load_names_data():
    fp = os.path.join(tempfile.gettempdir(), ZIP_NAME)
    if not os.path.exists(fp):
        r = requests.get(URL_NAMES)
        with open(fp, 'wb') as f:
            f.write(r.content)
    post = collections.OrderedDict()
    with zipfile.ZipFile(fp) as zf:
        # get ZipInfo instances
        for zi in sorted(zf.infolist(), key=lambda zi: zi.filename):
            fn = zi.filename
            if fn.startswith('yob'):
                year = int(fn[3:7])
                df = pd.read_csv(
                    zf.open(zi),
                    header=None,
                    names=('name', 'gender', 'count'))
                df['year'] = year
                post[year] = df
    df = pd.concat(post.values())
    df.set_index('name', inplace=True, drop=True)
    return df
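# Usage sketch (assumes the module-level constants URL_NAMES and ZIP_NAME,
# defined elsewhere in the source, point at the SSA baby-names zip whose
# members are named 'yobYYYY.txt'):
#
# names = load_names_data()
# print(names.loc['Mary'].groupby('year')['count'].sum().tail())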
def read_data(fname):
    """ Read football-data.co.uk csv """
    data = (
        pd.read_csv(fname)
        .rename(columns={
            'HomeTeam': 'home_team',
            'AwayTeam': 'away_team',
            'FTHG': 'home_goals',
            'FTAG': 'away_goals'
        })
        .loc[lambda df: ~pd.isnull(df['home_goals'])]  # Remove future games
    )
    team_map = stan_map(pd.concat([data['home_team'], data['away_team']]))
    data['home_team_id'] = data['home_team'].replace(team_map)
    data['away_team_id'] = data['away_team'].replace(team_map)
    for col in ('home_goals', 'away_goals'):
        data[col] = [int(c) for c in data[col]]
    return data, team_map
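# stan_map is defined elsewhere in the source; a minimal sketch of what such a
# helper could look like (this exact implementation is an assumption): map each
# distinct team name in the concatenated home/away column to a 1-based integer
# id, since Stan indexes from 1.
def stan_map_sketch(teams):
    return {team: i for i, team in enumerate(sorted(teams.unique()), start=1)}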
def QA_fetch_get_security_bars(code, _type, lens, ip=best_ip['stock'], port=7709):
    api = TdxHq_API()
    with api.connect(ip, port):
        data = pd.concat([api.to_df(api.get_security_bars(_select_type(_type), _select_market_code(
            code), code, (i - 1) * 800, 800)) for i in range(1, int(lens / 800) + 2)], axis=0)
        data = data\
            .assign(datetime=pd.to_datetime(data['datetime']), code=str(code))\
            .drop(['year', 'month', 'day', 'hour', 'minute'], axis=1, inplace=False)\
            .assign(date=data['datetime'].apply(lambda x: str(x)[0:10]))\
            .assign(date_stamp=data['datetime'].apply(lambda x: QA_util_date_stamp(x)))\
            .assign(time_stamp=data['datetime'].apply(lambda x: QA_util_time_stamp(x)))\
            .assign(type=_type).set_index('datetime', drop=False, inplace=False).tail(lens)
        if data is not None:
            return data
        else:
            return None
def QA_fetch_get_stock_block(ip=best_ip['stock'], port=7709):
    '''Fetch TDX stock block (sector/concept) classification data.'''
    api = TdxHq_API()
    with api.connect(ip, port):
        data = pd.concat([api.to_df(api.get_and_parse_block_info("block_gn.dat")).assign(type='gn'),
                          api.to_df(api.get_and_parse_block_info(
                              "block.dat")).assign(type='yb'),
                          api.to_df(api.get_and_parse_block_info(
                              "block_zs.dat")).assign(type='zs'),
                          api.to_df(api.get_and_parse_block_info("block_fg.dat")).assign(type='fg')])
        if len(data) > 10:
            return data.assign(source='tdx').drop(['block_type', 'code_index'], axis=1).set_index('code', drop=False, inplace=False).drop_duplicates()
        else:
            QA_util_log_info('Wrong with fetch block ')
def QA_fetch_get_future_day(code, start_date, end_date, level='day', ip=best_ip['future'], port=7727):
    '''Fetch daily futures bars for the given code and date range.'''
    apix = TdxExHq_API()
    start_date = str(start_date)[0:10]
    today_ = datetime.date.today()
    lens = QA_util_get_trade_gap(start_date, today_)
    global extension_market_info
    extension_market_info = QA_fetch_get_future_list() if extension_market_info is None else extension_market_info
    with apix.connect(ip, port):
        code_market = extension_market_info.query('code=="{}"'.format(code))
        data = pd.concat([apix.to_df(apix.get_instrument_bars(_select_type(
            level), int(code_market.market), str(code), (int(lens / 700) - i) * 700, 700)) for i in range(int(lens / 700) + 1)], axis=0)
        data = data.assign(date=data['datetime'].apply(lambda x: str(x[0:10]))).assign(code=str(code))\
            .assign(date_stamp=data['datetime'].apply(lambda x: QA_util_date_stamp(str(x)[0:10]))).set_index('date', drop=False, inplace=False)
        return data.drop(['year', 'month', 'day', 'hour', 'minute', 'datetime'], axis=1)[start_date:end_date].assign(date=data['date'].apply(lambda x: str(x)[0:10]))
def QA_data_make_qfq(bfq_data, xdxr_data):
    '''Build forward-adjusted (qfq) prices from un-adjusted bars and ex-dividend/rights (xdxr) records.'''
    info = xdxr_data[xdxr_data['category'] == 1]
    bfq_data['if_trade'] = 1
    data = pd.concat([bfq_data, info[['category']]
                      [bfq_data.index[0]:bfq_data.index[-1]]], axis=1)
    data['if_trade'].fillna(value=0, inplace=True)
    data = data.fillna(method='ffill')
    data = pd.concat([data, info[['fenhong', 'peigu', 'peigujia',
                                  'songzhuangu']][bfq_data.index[0]:bfq_data.index[-1]]], axis=1)
    data = data.fillna(0)
    data['preclose'] = (data['close'].shift(1) * 10 - data['fenhong'] + data['peigu']
                        * data['peigujia']) / (10 + data['peigu'] + data['songzhuangu'])
    data['adj'] = (data['preclose'].shift(-1) /
                   data['close']).fillna(1)[::-1].cumprod()
    data['open'] = data['open'] * data['adj']
    data['high'] = data['high'] * data['adj']
    data['low'] = data['low'] * data['adj']
    data['close'] = data['close'] * data['adj']
    data['preclose'] = data['preclose'] * data['adj']
    return data.query('if_trade==1').drop(['fenhong', 'peigu', 'peigujia', 'songzhuangu',
                                           'if_trade', 'category'], axis=1).query("open != 0")
def QA_data_make_hfq(bfq_data, xdxr_data):
    '''Build backward-adjusted (hfq) prices from un-adjusted bars and ex-dividend/rights (xdxr) records.'''
    info = xdxr_data[xdxr_data['category'] == 1]
    bfq_data['if_trade'] = 1
    data = pd.concat([bfq_data, info[['category']]
                      [bfq_data.index[0]:bfq_data.index[-1]]], axis=1)
    data['if_trade'].fillna(value=0, inplace=True)
    data = data.fillna(method='ffill')
    data = pd.concat([data, info[['fenhong', 'peigu', 'peigujia',
                                  'songzhuangu']][bfq_data.index[0]:bfq_data.index[-1]]], axis=1)
    data = data.fillna(0)
    data['preclose'] = (data['close'].shift(1) * 10 - data['fenhong'] + data['peigu']
                        * data['peigujia']) / (10 + data['peigu'] + data['songzhuangu'])
    data['adj'] = (data['preclose'].shift(-1) /
                   data['close']).fillna(1).cumprod()
    data['open'] = data['open'] / data['adj']
    data['high'] = data['high'] / data['adj']
    data['low'] = data['low'] / data['adj']
    data['close'] = data['close'] / data['adj']
    data['preclose'] = data['preclose'] / data['adj']
    return data.query('if_trade==1').drop(['fenhong', 'peigu', 'peigujia', 'songzhuangu'], axis=1).query("open != 0")
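# Usage sketch for the two adjustment helpers above (bfq_data holds un-adjusted
# OHLC bars, xdxr_data holds the dividend/rights records; both are fetched
# elsewhere, e.g. via the QA_fetch_* helpers in this listing):
#
# qfq = QA_data_make_qfq(bfq_data, xdxr_data)  # history rescaled toward the latest price
# hfq = QA_data_make_hfq(bfq_data, xdxr_data)  # history rescaled from the earliest price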
def get_text_len(DB, tr, te):
    if tr is None:
        if te == 'stage1':
            Data = [DB.data['training_text'], DB.data['test_text_filter']]
        else:
            Data = [pd.concat([DB.data['training_text'], DB.data['test_text_filter']], axis=0),
                    DB.data['stage2_test_text']]
    else:
        Data = [DB.data['training_text']]
    for data in Data:
        data['tl'] = data['Text'].apply(lambda x: len(x))
        data['tl2'] = data['Text'].apply(lambda x: len(x.split()))
    if tr is None:
        X, Xt = Data
        return X[['tl', 'tl2']].values, Xt[['tl', 'tl2']].values
    else:
        X = Data[0][['tl', 'tl2']].values
        return X[tr], X[te]
def get_pattern(DB, tr, te, patterns):
    cols = ['p%d' % c for c, p in enumerate(patterns)]
    if tr is None:
        test = DB.data['test_variants_filter'] if te == 'stage1' else DB.data['stage2_test_variants']
        if te == 'stage1':
            train = DB.data['training_variants']
        else:
            train = pd.concat([DB.data['training_variants'], DB.data["test_variants_filter"]], axis=0)
        Data = [train, test]
    else:
        Data = [DB.data['training_variants']]
    for data in Data:
        for c, p in enumerate(patterns):
            data['p%d' % c] = data['Variation'].apply(lambda x: len(re.findall(p, str(x).lower())))
    if tr is None:
        return train[cols].values, test[cols].values
    else:
        X = data[cols].values
        return X[tr], X[te]
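# Example call (the regex list below is made up purely for illustration; the
# real patterns are supplied by the caller):
#
# patterns = [r'del', r'ins', r'trunc', r'fus']
# Xtr, Xte = get_pattern(DB, None, 'stage1', patterns)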
def onehot_gene(DB, tr, te):
    from utils.np_utils.encoder import onehot_encode
    if tr is None:
        train = DB.data['training_variants']
        if te == "stage1":
            test = DB.data['test_variants_filter']
        else:
            train = pd.concat([train, DB.data['test_variants_filter']], axis=0)
            test = DB.data['stage2_test_variants']
        lbl_encode(train, test)
        n = max(train['Gene'].max(), test['Gene'].max())
        gtr = onehot_encode(train['Gene'].values, n=n + 1)
        gte = onehot_encode(test['Gene'].values)  # note: no explicit n here, unlike gtr
        return gtr, gte
    else:
        data = DB.data['training_variants']
        lbl_encode(data, cols=['Gene'])
        gene = data['Gene'].values
        gene = onehot_encode(gene)
        return gene[tr], gene[te]
def post_cv(flags):
    import re
    import os
    path = flags.data_path
    files = [i for i in os.listdir(path) if len(re.findall('cv_[0-9].csv', i))]
    s = []
    for name in files:
        s.append(pd.read_csv("%s/%s" % (path, name)))
    s = pd.concat(s, axis=0)
    print(s.head())
    classes = len([i for i in s.columns.values if 'class' in i])
    from utils.np_utils.utils import cross_entropy
    yp = s[['class%d' % i for i in range(1, classes + 1)]].values
    y = s['real'].values
    print(cross_entropy(y, yp))
    s.to_csv("%s/cv.csv" % path, index=False)
def create_agents(self, generator):
    """
    Given information on a set of countries and a generator function,
    generate the agents and assign the results to ``self.agents``.

    :type generator: DataFrame, str, int
    :param generator: A function which generates the agents.
    """
    self.generator = generator
    country_array = pd.concat([pd.Series([c] * k["Population"]) for c, k in self.df.iterrows()])
    country_array.index = range(len(country_array))
    # Garbage collect before creating new processes.
    gc.collect()
    self.agents = pd.concat(
        self.pool.imap(self._gen_agents,
                       np.array_split(country_array, self.processes * self.splits))
    )
    self.agents.index = range(len(self.agents))
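# The split -> map -> concat pattern above in a minimal, serial form (the
# country frame and per-chunk work are stubbed out for illustration; the real
# code runs _gen_agents over a multiprocessing pool):
import numpy as np
import pandas as pd

toy_countries = pd.Series(['A'] * 3 + ['B'] * 2)
chunks = np.array_split(toy_countries, 2)
toy_agents = pd.concat(pd.DataFrame({'country': chunk}) for chunk in chunks)
toy_agents.index = range(len(toy_agents))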
def OHETr(self, tr):
    """"""
    OHEDict = {}
    for col in tr.columns:
        ValueCounts = [str(int(v)) for v in tr[col].value_counts().index.values]
        ValueCounts.append('missing')
        SelectedValues = dict((k, v) for (v, k) in enumerate(ValueCounts, start=0))
        OHTr = self.__ApplyOH(tr[col].values, SelectedValues)
        headers = dict((('%s_%s' % (col, k)), SelectedValues[k]) for k in SelectedValues)
        tmp = [v[0] for v in sorted(headers.items(), key=lambda x: x[1])]
        OHDFTr = pd.DataFrame(OHTr, index=tr.index, columns=tmp)
        tr = pd.concat([tr, OHDFTr], axis=1)
        tr.drop(col, axis=1, inplace=True)
        OHEDict[col] = SelectedValues
        # print('Column %s was encoded.' % col)
    return tr, OHEDict
def combineFilesIntoDf(file_path, filenames, reset_index=False, drop_cols=None):
    df = None
    for filename in filenames:
        # read_csv with index_col=0 mirrors the old DataFrame.from_csv behaviour
        fdf = pd.read_csv(file_path + filename, index_col=0)
        if reset_index:
            fdf = fdf.reset_index()
        if df is None:
            df = fdf.copy(deep=True)
        else:
            df = pd.concat([df, fdf])
    if drop_cols is not None:
        for feat in drop_cols:
            df = df.drop(feat, axis=1)
    return df
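# Usage sketch (the directory and file names below are assumptions):
#
# df = combineFilesIntoDf('data/', ['day1.csv', 'day2.csv'],
#                         reset_index=True, drop_cols=['unused_feature'])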
def tsne_cluster_cuisine(df, sublist):
    lenlist = [0]
    df_sub = df[df['cuisine'] == sublist[0]]
    lenlist.append(df_sub.shape[0])
    for cuisine in sublist[1:]:
        temp = df[df['cuisine'] == cuisine]
        df_sub = pd.concat([df_sub, temp], axis=0, ignore_index=True)
        lenlist.append(df_sub.shape[0])
    df_X = df_sub.drop(['cuisine', 'recipeName'], axis=1)
    print(df_X.shape, lenlist)
    dist = squareform(pdist(df_X, metric='cosine'))
    tsne = TSNE(metric='precomputed').fit_transform(dist)
    palette = sns.color_palette("hls", len(sublist))
    plt.figure(figsize=(10, 10))
    for i, cuisine in enumerate(sublist):
        plt.scatter(tsne[lenlist[i]:lenlist[i + 1], 0],
                    tsne[lenlist[i]:lenlist[i + 1], 1], c=palette[i], label=sublist[i])
    plt.legend()
# interactive plot with bokeh; set up for four categories, with color palette; pass in df for either ingredient or flavor
def create_future(fold, features_old, cfg_parameters):
    """
    Just for testing purposes.
    Sets up a replicate of the last day(s) data to create new data for testing. But in reality,
    we should be able to create features for the upcoming days from past data, so this would not be needed???
    """
    last_day = fold['window_end']
    next_days = [last_day + timedelta(days=i) for i in range(1, (cfg_parameters['prediction_horizon'] + 1))]
    old_features_unique = features_old.drop_duplicates(subset='ToiletID')
    l_future_features = []
    for day in next_days:
        next_day_features = old_features_unique.copy()
        next_day_features["Collection_Date"] = day
        l_future_features.append(next_day_features)
    future_features = pd.concat(l_future_features, ignore_index=True)
    return future_features
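# Standalone sketch of the replication pattern used above: copy the latest
# feature rows once per future day and stack them with pd.concat. Toy frame
# and dates, made up for illustration:
import pandas as pd
from datetime import date, timedelta

base = pd.DataFrame({'ToiletID': [1, 2], 'feature': [0.3, 0.7]})
future_days = [date(2020, 1, 1) + timedelta(days=i) for i in range(1, 4)]
future = pd.concat(
    [base.assign(Collection_Date=d) for d in future_days], ignore_index=True
)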