def data_preprocess(train, test):
    outlier_idx = [4, 11, 13, 20, 46, 66, 70, 167, 178, 185, 199, 224, 261, 309, 313, 318,
                   349, 412, 423, 440, 454, 477, 478, 523, 540, 581, 588, 595, 654, 688,
                   691, 774, 798, 875, 898, 926, 970, 987, 1027, 1109, 1169, 1182, 1239,
                   1256, 1298, 1324, 1353, 1359, 1405, 1442, 1447]
    train.drop(train.index[outlier_idx], inplace=True)
    all_data = pd.concat((train.loc[:, 'MSSubClass':'SaleCondition'],
                          test.loc[:, 'MSSubClass':'SaleCondition']))
    to_delete = ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
    all_data = all_data.drop(to_delete, axis=1)
    train["SalePrice"] = np.log1p(train["SalePrice"])
    # log transform skewed numeric features
    numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
    skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna()))  # compute skewness
    skewed_feats = skewed_feats[skewed_feats > 0.75]
    skewed_feats = skewed_feats.index
    all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
    all_data = pd.get_dummies(all_data)
    all_data = all_data.fillna(all_data.mean())
    X_train = all_data[:train.shape[0]]
    X_test = all_data[train.shape[0]:]
    y = train.SalePrice
    return X_train, X_test, y
def data_preprocess(train, test):
    outlier_idx = [4, 11, 13, 20, 46, 66, 70, 167, 178, 185, 199, 224, 261, 309, 313, 318,
                   349, 412, 423, 440, 454, 477, 478, 523, 540, 581, 588, 595, 654, 688,
                   691, 774, 798, 875, 898, 926, 970, 987, 1027, 1109, 1169, 1182, 1239,
                   1256, 1298, 1324, 1353, 1359, 1405, 1442, 1447]
    train.drop(train.index[outlier_idx], inplace=True)
    all_data = pd.concat((train.loc[:, 'MSSubClass':'SaleCondition'],
                          test.loc[:, 'MSSubClass':'SaleCondition']))
    to_delete = ['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']
    all_data = all_data.drop(to_delete, axis=1)
    train["SalePrice"] = np.log1p(train["SalePrice"])
    # log transform skewed numeric features
    numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
    skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna()))  # compute skewness
    skewed_feats = skewed_feats[skewed_feats > 0.75]
    skewed_feats = skewed_feats.index
    all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
    all_data = pd.get_dummies(all_data)
    all_data = all_data.fillna(method='ffill')
    X_train = all_data[:train.shape[0]]
    X_test = all_data[train.shape[0]:]
    y = train.SalePrice
    return X_train, X_test, y
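# A minimal usage sketch for the function above, assuming the Kaggle House
# Prices CSVs are available locally as 'train.csv' and 'test.csv' (the file
# names are an assumption) and that numpy, pandas and scipy.stats.skew are
# imported at module level as np, pd and skew:
#
# train = pd.read_csv('train.csv')
# test = pd.read_csv('test.csv')
# X_train, X_test, y = data_preprocess(train, test)
# print(X_train.shape, X_test.shape, y.shape)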
def do_work_pso(item, nclusters, data, LVcsv, Mcsv, scheme, reg, h, maximo, population):
    output = pd.DataFrame(population[item].position)
    output.columns = ['Split']
    dataSplit = pd.concat([data, output], axis=1)
    f1 = []
    results = []
    for i in range(nclusters):
        dataSplited = (dataSplit.loc[dataSplit['Split']
                                     == i]).drop('Split', axis=1)
        dataSplited.index = range(len(dataSplited))
        try:
            results.append(PyLSpm(dataSplited, LVcsv, Mcsv, scheme,
                                  reg, 0, 50, HOC='true'))
            resid = results[i].residuals()[3]
            f1.append(resid)
        except:
            f1.append(10000)
    # print((1 / np.sum(f1)))
    return (1 / np.sum(f1))
def do_work_ga(item, nclusters, data, LVcsv, Mcsv, scheme, reg, h, maximo, population):
    output = pd.DataFrame(population[item].genes)
    output.columns = ['Split']
    dataSplit = pd.concat([data, output], axis=1)
    f1 = []
    results = []
    for i in range(nclusters):
        dataSplited = (dataSplit.loc[dataSplit['Split']
                                     == i]).drop('Split', axis=1)
        dataSplited.index = range(len(dataSplited))
        try:
            results.append(PyLSpm(dataSplited, LVcsv, Mcsv, scheme,
                                  reg, 0, 50, HOC='true'))
            resid = results[i].residuals()[3]
            f1.append(resid)
        except:
            f1.append(10000)
    return (1 / np.sum(f1))
# Main
def do_work_pso(data, LVcsv, Mcsv, scheme, reg, h, maximo):
    # NOTE: `item`, `population` and `nclusters` are not parameters of this
    # variant; as written it expects them to be available at module scope.
    output = pd.DataFrame(population[item].position)
    output.columns = ['Split']
    dataSplit = pd.concat([data, output], axis=1)
    f1 = []
    results = []
    for i in range(nclusters):
        dataSplited = (dataSplit.loc[dataSplit['Split']
                                     == i]).drop('Split', axis=1)
        dataSplited.index = range(len(dataSplited))
        try:
            results.append(PyLSpm(dataSplited, LVcsv, Mcsv, scheme,
                                  reg, 0, 50, HOC='true'))
            resid = results[i].residuals()[3]
            f1.append(resid)
        except:
            f1.append(10000)
    print((1 / np.sum(f1)))
    return (1 / np.sum(f1))
def do_work_ga(self, item):
    output = pd.DataFrame(self.population[item].genes)
    output.columns = ['Split']
    dataSplit = pd.concat([self.data, output], axis=1)
    f1 = []
    results = []
    for i in range(self.nclusters):
        dataSplited = (dataSplit.loc[dataSplit['Split']
                                     == i]).drop('Split', axis=1)
        dataSplited.index = range(len(dataSplited))
        try:
            results.append(PyLSpm(dataSplited, self.LVcsv, self.Mcsv, self.scheme,
                                  self.reg, 0, 50, HOC='true'))
            resid = results[i].residuals()[3]
            f1.append(resid)
        except:
            f1.append(10000)
    print((1 / np.sum(f1)))
    return (1 / np.sum(f1))
def do_work_pso(self, item):
    output = pd.DataFrame(self.population[item].position)
    output.columns = ['Split']
    dataSplit = pd.concat([self.data, output], axis=1)
    f1 = []
    results = []
    for i in range(self.nclusters):
        dataSplited = (dataSplit.loc[dataSplit['Split']
                                     == i]).drop('Split', axis=1)
        dataSplited.index = range(len(dataSplited))
        try:
            results.append(PyLSpm(dataSplited, self.LVcsv, self.Mcsv, self.scheme,
                                  self.reg, 0, 50, HOC='true'))
            resid = results[i].residuals()[3]
            f1.append(resid)
        except:
            f1.append(10000)
    print((1 / np.sum(f1)))
    return (1 / np.sum(f1))
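# The core pd.concat usage shared by the do_work_* variants above, shown in
# isolation: a cluster-assignment column is concatenated onto the data
# (axis=1) and then used to slice out per-cluster frames. Toy data and
# made-up assignments, a minimal sketch only:
import pandas as pd

toy_data = pd.DataFrame({'x': [1.0, 2.0, 3.0, 4.0]})
toy_split = pd.DataFrame({'Split': [0, 1, 0, 1]})
toy_dataSplit = pd.concat([toy_data, toy_split], axis=1)
toy_cluster0 = toy_dataSplit.loc[toy_dataSplit['Split'] == 0].drop('Split', axis=1)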
def merged(self, s, t):
    chars = []
    for c1, c2 in zip_longest(s.sequence, t.sequence):
        if c1 is None:
            c = c2
        elif c2 is None:
            c = c1
        elif c1 == 'N':
            c = c2
        elif c2 == 'N':
            c = c1
        elif c1 != c2:
            return None
        else:
            assert c1 == c2
            c = c1
        chars.append(c)
    seq = ''.join(chars)
    requested = s.requested or t.requested
    name = s.name + ';' + t.name
    # take union of groups
    group = pd.concat([s.group, t.group]).groupby(level=0).last()
    return SiblingInfo(seq, requested, name, group)
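# The "take union of groups" trick above, shown on toy Series: concatenating
# two index-labelled Series and keeping the last value per index label yields
# the union of their indices, with values from the second Series winning on
# overlap. Labels and values below are made up for illustration.
import pandas as pd

s_group = pd.Series({'g1': 1, 'g2': 2})
t_group = pd.Series({'g2': 5, 'g3': 3})
union = pd.concat([s_group, t_group]).groupby(level=0).last()
# union now has index ['g1', 'g2', 'g3'], with 'g2' taken from t_group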
def update_dividends(self, new_dividends):
    """
    Update our dividend frame with new dividends. @new_dividends should be
    a DataFrame with columns containing at least the entries in
    zipline.protocol.DIVIDEND_FIELDS.
    """
    # Mark each new dividend with a unique integer id. This ensures that
    # we can differentiate dividends whose date/sid fields are otherwise
    # identical.
    new_dividends['id'] = np.arange(
        self._dividend_count,
        self._dividend_count + len(new_dividends),
    )
    self._dividend_count += len(new_dividends)
    self.dividend_frame = sort_values(pd.concat(
        [self.dividend_frame, new_dividends]
    ), ['pay_date', 'ex_date']).set_index('id', drop=False)
def pipeline_event_loader_args(self, dates):
    _, mapping = super(
        BlazeCashBuybackAuthLoaderTestCase,
        self,
    ).pipeline_event_loader_args(dates)
    return (bz.data(pd.concat(
        pd.DataFrame({
            BUYBACK_ANNOUNCEMENT_FIELD_NAME:
                frame[BUYBACK_ANNOUNCEMENT_FIELD_NAME],
            CASH_FIELD_NAME:
                frame[CASH_FIELD_NAME],
            TS_FIELD_NAME:
                frame[TS_FIELD_NAME],
            SID_FIELD_NAME: sid,
        })
        for sid, frame in iteritems(mapping)
    ).reset_index(drop=True)),)
def pipeline_event_loader_args(self, dates):
    _, mapping = super(
        BlazeShareBuybackAuthLoaderTestCase,
        self,
    ).pipeline_event_loader_args(dates)
    return (bz.data(pd.concat(
        pd.DataFrame({
            BUYBACK_ANNOUNCEMENT_FIELD_NAME:
                frame[BUYBACK_ANNOUNCEMENT_FIELD_NAME],
            SHARE_COUNT_FIELD_NAME:
                frame[SHARE_COUNT_FIELD_NAME],
            TS_FIELD_NAME:
                frame[TS_FIELD_NAME],
            SID_FIELD_NAME: sid,
        })
        for sid, frame in iteritems(mapping)
    ).reset_index(drop=True)),)
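# Generic sketch of the pattern used by both loaders above: pd.concat accepts
# a generator of DataFrames, so a {sid: frame} mapping can be flattened into
# one long frame with the key carried along as a column. Toy data and made-up
# column names:
import pandas as pd

toy_mapping = {1: pd.DataFrame({'value': [10, 11]}),
               2: pd.DataFrame({'value': [20]})}
long_frame = pd.concat(
    frame.assign(sid=sid) for sid, frame in toy_mapping.items()
).reset_index(drop=True)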
def load_names_data():
    fp = os.path.join(tempfile.gettempdir(), ZIP_NAME)
    if not os.path.exists(fp):
        r = requests.get(URL_NAMES)
        with open(fp, 'wb') as f:
            f.write(r.content)
    post = collections.OrderedDict()
    with zipfile.ZipFile(fp) as zf:
        # get ZipInfo instances
        for zi in sorted(zf.infolist(), key=lambda zi: zi.filename):
            fn = zi.filename
            if fn.startswith('yob'):
                year = int(fn[3:7])
                df = pd.read_csv(
                    zf.open(zi),
                    header=None,
                    names=('name', 'gender', 'count'))
                df['year'] = year
                post[year] = df
    df = pd.concat(post.values())
    df.set_index('name', inplace=True, drop=True)
    return df
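# Usage sketch (assumes the module-level constants URL_NAMES and ZIP_NAME,
# defined elsewhere in the source, point at the SSA baby-names zip whose
# members are named 'yobYYYY.txt'):
#
# names = load_names_data()
# print(names.loc['Mary'].groupby('year')['count'].sum().tail())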
def read_data(fname):
    """ Read football-data.co.uk csv """
    data = (
        pd.read_csv(fname)
        .rename(columns={
            'HomeTeam': 'home_team',
            'AwayTeam': 'away_team',
            'FTHG': 'home_goals',
            'FTAG': 'away_goals'
        })
        .loc[lambda df: ~pd.isnull(df['home_goals'])]  # Remove future games
    )
    team_map = stan_map(pd.concat([data['home_team'], data['away_team']]))
    data['home_team_id'] = data['home_team'].replace(team_map)
    data['away_team_id'] = data['away_team'].replace(team_map)
    for col in ('home_goals', 'away_goals'):
        data[col] = [int(c) for c in data[col]]
    return data, team_map
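# stan_map is defined elsewhere in the source; a minimal sketch of what such a
# helper could look like (this exact implementation is an assumption): map each
# distinct team name in the concatenated home/away column to a 1-based integer
# id, since Stan indexes from 1.
def stan_map_sketch(teams):
    return {team: i for i, team in enumerate(sorted(teams.unique()), start=1)}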
def QA_fetch_get_security_bars(code, _type, lens, ip=best_ip['stock'], port=7709):
    api = TdxHq_API()
    with api.connect(ip, port):
        data = pd.concat([api.to_df(api.get_security_bars(_select_type(_type), _select_market_code(
            code), code, (i - 1) * 800, 800)) for i in range(1, int(lens / 800) + 2)], axis=0)
        data = data\
            .assign(datetime=pd.to_datetime(data['datetime']), code=str(code))\
            .drop(['year', 'month', 'day', 'hour', 'minute'], axis=1, inplace=False)\
            .assign(date=data['datetime'].apply(lambda x: str(x)[0:10]))\
            .assign(date_stamp=data['datetime'].apply(lambda x: QA_util_date_stamp(x)))\
            .assign(time_stamp=data['datetime'].apply(lambda x: QA_util_time_stamp(x)))\
            .assign(type=_type).set_index('datetime', drop=False, inplace=False).tail(lens)
        if data is not None:
            return data
        else:
            return None
def QA_fetch_get_stock_block(ip=best_ip['stock'], port=7709):
    '''Fetch TDX stock block (sector/concept) classification data.'''
    api = TdxHq_API()
    with api.connect(ip, port):
        data = pd.concat([api.to_df(api.get_and_parse_block_info("block_gn.dat")).assign(type='gn'),
                          api.to_df(api.get_and_parse_block_info(
                              "block.dat")).assign(type='yb'),
                          api.to_df(api.get_and_parse_block_info(
                              "block_zs.dat")).assign(type='zs'),
                          api.to_df(api.get_and_parse_block_info("block_fg.dat")).assign(type='fg')])
        if len(data) > 10:
            return data.assign(source='tdx').drop(['block_type', 'code_index'], axis=1).set_index('code', drop=False, inplace=False).drop_duplicates()
        else:
            QA_util_log_info('Wrong with fetch block ')
def QA_fetch_get_future_day(code, start_date, end_date, level='day', ip=best_ip['future'], port=7727):
    '''Fetch daily futures bars for the given code and date range.'''
    apix = TdxExHq_API()
    start_date = str(start_date)[0:10]
    today_ = datetime.date.today()
    lens = QA_util_get_trade_gap(start_date, today_)
    global extension_market_info
    extension_market_info = QA_fetch_get_future_list() if extension_market_info is None else extension_market_info
    with apix.connect(ip, port):
        code_market = extension_market_info.query('code=="{}"'.format(code))
        data = pd.concat([apix.to_df(apix.get_instrument_bars(_select_type(
            level), int(code_market.market), str(code), (int(lens / 700) - i) * 700, 700)) for i in range(int(lens / 700) + 1)], axis=0)
        data = data.assign(date=data['datetime'].apply(lambda x: str(x[0:10]))).assign(code=str(code))\
            .assign(date_stamp=data['datetime'].apply(lambda x: QA_util_date_stamp(str(x)[0:10]))).set_index('date', drop=False, inplace=False)
        return data.drop(['year', 'month', 'day', 'hour', 'minute', 'datetime'], axis=1)[start_date:end_date].assign(date=data['date'].apply(lambda x: str(x)[0:10]))
def QA_data_make_qfq(bfq_data, xdxr_data):
    '''Build forward-adjusted (qfq) prices from un-adjusted bars and ex-dividend/rights (xdxr) records.'''
    info = xdxr_data[xdxr_data['category'] == 1]
    bfq_data['if_trade'] = 1
    data = pd.concat([bfq_data, info[['category']]
                      [bfq_data.index[0]:bfq_data.index[-1]]], axis=1)
    data['if_trade'].fillna(value=0, inplace=True)
    data = data.fillna(method='ffill')
    data = pd.concat([data, info[['fenhong', 'peigu', 'peigujia',
                                  'songzhuangu']][bfq_data.index[0]:bfq_data.index[-1]]], axis=1)
    data = data.fillna(0)
    data['preclose'] = (data['close'].shift(1) * 10 - data['fenhong'] + data['peigu']
                        * data['peigujia']) / (10 + data['peigu'] + data['songzhuangu'])
    data['adj'] = (data['preclose'].shift(-1) /
                   data['close']).fillna(1)[::-1].cumprod()
    data['open'] = data['open'] * data['adj']
    data['high'] = data['high'] * data['adj']
    data['low'] = data['low'] * data['adj']
    data['close'] = data['close'] * data['adj']
    data['preclose'] = data['preclose'] * data['adj']
    return data.query('if_trade==1').drop(['fenhong', 'peigu', 'peigujia', 'songzhuangu',
                                           'if_trade', 'category'], axis=1).query("open != 0")
def QA_data_make_hfq(bfq_data, xdxr_data):
    '''Build backward-adjusted (hfq) prices from un-adjusted bars and ex-dividend/rights (xdxr) records.'''
    info = xdxr_data[xdxr_data['category'] == 1]
    bfq_data['if_trade'] = 1
    data = pd.concat([bfq_data, info[['category']]
                      [bfq_data.index[0]:bfq_data.index[-1]]], axis=1)
    data['if_trade'].fillna(value=0, inplace=True)
    data = data.fillna(method='ffill')
    data = pd.concat([data, info[['fenhong', 'peigu', 'peigujia',
                                  'songzhuangu']][bfq_data.index[0]:bfq_data.index[-1]]], axis=1)
    data = data.fillna(0)
    data['preclose'] = (data['close'].shift(1) * 10 - data['fenhong'] + data['peigu']
                        * data['peigujia']) / (10 + data['peigu'] + data['songzhuangu'])
    data['adj'] = (data['preclose'].shift(-1) /
                   data['close']).fillna(1).cumprod()
    data['open'] = data['open'] / data['adj']
    data['high'] = data['high'] / data['adj']
    data['low'] = data['low'] / data['adj']
    data['close'] = data['close'] / data['adj']
    data['preclose'] = data['preclose'] / data['adj']
    return data.query('if_trade==1').drop(['fenhong', 'peigu', 'peigujia', 'songzhuangu'], axis=1).query("open != 0")
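# Usage sketch for the two adjustment helpers above (bfq_data holds un-adjusted
# OHLC bars, xdxr_data holds the dividend/rights records; both are fetched
# elsewhere, e.g. via the QA_fetch_* helpers in this listing):
#
# qfq = QA_data_make_qfq(bfq_data, xdxr_data)  # history rescaled toward the latest price
# hfq = QA_data_make_hfq(bfq_data, xdxr_data)  # history rescaled from the earliest price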
def get_text_len(DB, tr, te):
    if tr is None:
        if te == 'stage1':
            Data = [DB.data['training_text'], DB.data['test_text_filter']]
        else:
            Data = [pd.concat([DB.data['training_text'], DB.data['test_text_filter']], axis=0),
                    DB.data['stage2_test_text']]
    else:
        Data = [DB.data['training_text']]
    for data in Data:
        data['tl'] = data['Text'].apply(lambda x: len(x))
        data['tl2'] = data['Text'].apply(lambda x: len(x.split()))
    if tr is None:
        X, Xt = Data
        return X[['tl', 'tl2']].values, Xt[['tl', 'tl2']].values
    else:
        X = Data[0][['tl', 'tl2']].values
        return X[tr], X[te]
def get_pattern(DB, tr, te, patterns):
    cols = ['p%d' % c for c, p in enumerate(patterns)]
    if tr is None:
        test = DB.data['test_variants_filter'] if te == 'stage1' else DB.data['stage2_test_variants']
        if te == 'stage1':
            train = DB.data['training_variants']
        else:
            train = pd.concat([DB.data['training_variants'], DB.data["test_variants_filter"]], axis=0)
        Data = [train, test]
    else:
        Data = [DB.data['training_variants']]
    for data in Data:
        for c, p in enumerate(patterns):
            data['p%d' % c] = data['Variation'].apply(lambda x: len(re.findall(p, str(x).lower())))
    if tr is None:
        return train[cols].values, test[cols].values
    else:
        X = data[cols].values
        return X[tr], X[te]
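# Example call (the regex list below is made up purely for illustration; the
# real patterns are supplied by the caller):
#
# patterns = [r'del', r'ins', r'trunc', r'fus']
# Xtr, Xte = get_pattern(DB, None, 'stage1', patterns)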
def onehot_gene(DB, tr, te):
    from utils.np_utils.encoder import onehot_encode
    if tr is None:
        train = DB.data['training_variants']
        if te == "stage1":
            test = DB.data['test_variants_filter']
        else:
            train = pd.concat([train, DB.data['test_variants_filter']], axis=0)
            test = DB.data['stage2_test_variants']
        lbl_encode(train, test)
        n = max(train['Gene'].max(), test['Gene'].max())
        gtr = onehot_encode(train['Gene'].values, n=n + 1)
        gte = onehot_encode(test['Gene'].values)  # note: no explicit n here, unlike gtr
        return gtr, gte
    else:
        data = DB.data['training_variants']
        lbl_encode(data, cols=['Gene'])
        gene = data['Gene'].values
        gene = onehot_encode(gene)
        return gene[tr], gene[te]
def post_cv(flags):
    import re
    import os
    path = flags.data_path
    files = [i for i in os.listdir(path) if len(re.findall('cv_[0-9].csv', i))]
    s = []
    for name in files:
        s.append(pd.read_csv("%s/%s" % (path, name)))
    s = pd.concat(s, axis=0)
    print(s.head())
    classes = len([i for i in s.columns.values if 'class' in i])
    from utils.np_utils.utils import cross_entropy
    yp = s[['class%d' % i for i in range(1, classes + 1)]].values
    y = s['real'].values
    print(cross_entropy(y, yp))
    s.to_csv("%s/cv.csv" % path, index=False)
def create_agents(self, generator):
    """
    Given information on a set of countries and a generator function,
    generate the agents and assign the results to ``self.agents``.

    :type generator: DataFrame, str, int
    :param generator: A function which generates the agents.
    """
    self.generator = generator
    country_array = pd.concat([pd.Series([c] * k["Population"]) for c, k in self.df.iterrows()])
    country_array.index = range(len(country_array))
    # Garbage collect before creating new processes.
    gc.collect()
    self.agents = pd.concat(
        self.pool.imap(self._gen_agents,
                       np.array_split(country_array, self.processes * self.splits))
    )
    self.agents.index = range(len(self.agents))
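# The split -> map -> concat pattern above in a minimal, serial form (the
# country frame and per-chunk work are stubbed out for illustration; the real
# code runs _gen_agents over a multiprocessing pool):
import numpy as np
import pandas as pd

toy_countries = pd.Series(['A'] * 3 + ['B'] * 2)
chunks = np.array_split(toy_countries, 2)
toy_agents = pd.concat(pd.DataFrame({'country': chunk}) for chunk in chunks)
toy_agents.index = range(len(toy_agents))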
def OHETr(self, tr):
    """"""
    OHEDict = {}
    for col in tr.columns:
        ValueCounts = [str(int(v)) for v in tr[col].value_counts().index.values]
        ValueCounts.append('missing')
        SelectedValues = dict((k, v) for (v, k) in enumerate(ValueCounts, start=0))
        OHTr = self.__ApplyOH(tr[col].values, SelectedValues)
        headers = dict((('%s_%s' % (col, k)), SelectedValues[k]) for k in SelectedValues)
        tmp = [v[0] for v in sorted(headers.items(), key=lambda x: x[1])]
        OHDFTr = pd.DataFrame(OHTr, index=tr.index, columns=tmp)
        tr = pd.concat([tr, OHDFTr], axis=1)
        tr.drop(col, axis=1, inplace=True)
        OHEDict[col] = SelectedValues
        # print('Column %s was encoded.' % col)
    return tr, OHEDict
def combineFilesIntoDf(file_path, filenames, reset_index=False, drop_cols=None):
    df = None
    for filename in filenames:
        # read_csv with index_col=0 mirrors the old DataFrame.from_csv behaviour
        fdf = pd.read_csv(file_path + filename, index_col=0)
        if reset_index:
            fdf = fdf.reset_index()
        if df is None:
            df = fdf.copy(deep=True)
        else:
            df = pd.concat([df, fdf])
    if drop_cols is not None:
        for feat in drop_cols:
            df = df.drop(feat, axis=1)
    return df
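# Usage sketch (the directory and file names below are assumptions):
#
# df = combineFilesIntoDf('data/', ['day1.csv', 'day2.csv'],
#                         reset_index=True, drop_cols=['unused_feature'])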
def tsne_cluster_cuisine(df, sublist):
    lenlist = [0]
    df_sub = df[df['cuisine'] == sublist[0]]
    lenlist.append(df_sub.shape[0])
    for cuisine in sublist[1:]:
        temp = df[df['cuisine'] == cuisine]
        df_sub = pd.concat([df_sub, temp], axis=0, ignore_index=True)
        lenlist.append(df_sub.shape[0])
    df_X = df_sub.drop(['cuisine', 'recipeName'], axis=1)
    print(df_X.shape, lenlist)
    dist = squareform(pdist(df_X, metric='cosine'))
    tsne = TSNE(metric='precomputed').fit_transform(dist)
    palette = sns.color_palette("hls", len(sublist))
    plt.figure(figsize=(10, 10))
    for i, cuisine in enumerate(sublist):
        plt.scatter(tsne[lenlist[i]:lenlist[i + 1], 0],
                    tsne[lenlist[i]:lenlist[i + 1], 1], c=palette[i], label=sublist[i])
    plt.legend()
# interactive plot with bokeh; set up for four categories, with color palette; pass in df for either ingredient or flavor
def create_future(fold, features_old, cfg_parameters):
    """
    Just for testing purposes.
    Sets up a replicate of the last day(s) data to create new data for testing. But in reality,
    we should be able to create features for the upcoming days from past data, so this would not be needed???
    """
    last_day = fold['window_end']
    next_days = [last_day + timedelta(days=i) for i in range(1, (cfg_parameters['prediction_horizon'] + 1))]
    old_features_unique = features_old.drop_duplicates(subset='ToiletID')
    l_future_features = []
    for day in next_days:
        next_day_features = old_features_unique.copy()
        next_day_features["Collection_Date"] = day
        l_future_features.append(next_day_features)
    future_features = pd.concat(l_future_features, ignore_index=True)
    return future_features
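# Standalone sketch of the replication pattern used above: copy the latest
# feature rows once per future day and stack them with pd.concat. Toy frame
# and dates, made up for illustration:
import pandas as pd
from datetime import date, timedelta

base = pd.DataFrame({'ToiletID': [1, 2], 'feature': [0.3, 0.7]})
future_days = [date(2020, 1, 1) + timedelta(days=i) for i in range(1, 4)]
future = pd.concat(
    [base.assign(Collection_Date=d) for d in future_days], ignore_index=True
)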