def precalculate_factors(self, df):
    """Precomputes the buckets and labels for the Jackknife object.

    Args:
        df: A pandas DataFrame.
    """
    if self._unit is None:
        self._buckets = np.arange(len(df))
        self._bucket_labels = np.arange(len(df))
    else:
        self._buckets, names = pd.factorize(df[self._unit])
        self._bucket_labels = np.arange(len(names))
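# A minimal sketch (not from the repo above) of what pd.factorize returns:
# integer codes per row plus the array of uniques, which is how _buckets
# and _bucket_labels are derived.
import pandas as pd

codes, uniques = pd.factorize(pd.Series(["b", "a", "b", "c"]))
print(codes)    # [0 1 0 2] -- one bucket id per row, in order of appearance
print(uniques)  # Index(['b', 'a', 'c'], dtype='object')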
def precalculate_factors(self, df):
    """Initializes the labels for the Bootstrap object.

    Args:
        df: A pandas DataFrame.
    """
    if self._unit is not None:
        self._values, labels = pd.factorize(df[self._unit])
        self._labels = [self._values == ii for ii in range(len(labels))]
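# Hypothetical sketch (toy data) of the mask construction above: one
# boolean mask per factorized unit, usable for resampling whole units.
import pandas as pd

values, labels = pd.factorize(pd.Series(["u1", "u2", "u1"]))
masks = [values == ii for ii in range(len(labels))]
print(masks[0])  # [ True False  True] -- rows belonging to unit "u1"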
def precalculate_factors(self, data, sort=True):
    """Initializes the factor variable.

    Args:
        data: A pandas DataFrame.
        sort: Boolean indicating whether or not the conditions should be sorted.

    Raises:
        ValueError: The baseline key isn't found.
    """
    self.factors, condition_keys = pd.factorize(data[self.condition_column],
                                                sort=sort)
    self.alternate_indices = [
        ii for ii, label in enumerate(condition_keys)
        if self._include_base or label != self.baseline_key
    ]
    self.alternate_keys = condition_keys[self.alternate_indices]
    if any(condition_keys == self.baseline_key):
        self.baseline_index = np.where(condition_keys == self.baseline_key)[0][0]
    else:
        raise ValueError("Baseline value {} not present in column {}".format(
            self.baseline_key, self.condition_column))
    self._baseline_mask = (self.factors == self.baseline_index)
    self._alternate_masks = {}
    for ii in self.alternate_indices:
        self._alternate_masks[ii] = (self.factors == ii)
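# Toy sketch (column and key names are illustrative) of the baseline and
# alternate masks built above from the factorized condition column.
import numpy as np
import pandas as pd

df = pd.DataFrame({"condition": ["control", "treat_a", "control", "treat_b"]})
factors, keys = pd.factorize(df["condition"], sort=True)
baseline_index = np.where(keys == "control")[0][0]
print(factors == baseline_index)  # [ True False  True False]
print(keys[1:])                   # alternates: Index(['treat_a', 'treat_b'], ...)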
def __init__(self, metric, dimensions, name=None):
    """Initializes distribution estimator.

    Args:
        metric: Name of the column to calculate the distribution of.
        dimensions: List of column names to distribute the metric over.
        name: A string for the column name of results.
    """

    def _calculate(data, weights):
        """Calculates distribution metric."""
        total = 1.0 * _weighted_sum(data[metric].values, weights)
        # pd.lib.fast_zip was removed from pandas; zip the dimension
        # columns into one tuple per row instead.
        dimension_tuples = pd.Series(
            list(zip(*(data[ii].values for ii in dimensions))))
        factors, keys = pd.factorize(dimension_tuples)
        results = np.zeros(len(keys))
        for ii in range(len(keys)):
            results[ii] = _weighted_sum(data[metric].values,
                                        weights * (factors == ii)) / total
        output = pd.DataFrame(results,
                              index=pd.MultiIndex.from_tuples(keys,
                                                              names=dimensions),
                              columns=[""])
        return output

    if name is None:
        name = "{} Distribution".format(metric)
    super(Distribution, self).__init__(name, _calculate, "dataframe")
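# Standalone sketch of the pattern _calculate uses: factorize one tuple
# per row over the dimension columns, then take a weighted sum per group.
# (_weighted_sum and the estimator base class are assumed from the repo.)
import numpy as np
import pandas as pd

data = pd.DataFrame({"x": [1.0, 2.0, 3.0], "d": ["a", "a", "b"]})
weights = np.ones(len(data))
factors, keys = pd.factorize(pd.Series(list(zip(data["d"]))))
total = float(np.sum(data["x"].values * weights))
shares = [np.sum(data["x"].values * weights * (factors == ii)) / total
          for ii in range(len(keys))]
print(dict(zip(keys, shares)))  # {('a',): 0.5, ('b',): 0.5}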
def __init__(self, metric, dimensions, ascending=True, name=None):
    """Initializes cumulative distribution estimator.

    Args:
        metric: Name of the column to calculate the distribution of.
        dimensions: List of column names to distribute the metric over.
        ascending: List of booleans passed to pandas.sort_index that say
            whether to sort each dimension ascending or descending.
        name: A string for the column name of results.
    """

    def _calculate(data, weights):
        """Calculates cumulative distribution metric."""
        total = 1.0 * _weighted_sum(data[metric].values, weights)
        # pd.lib.fast_zip was removed from pandas; zip the dimension
        # columns into one tuple per row instead.
        dimension_tuples = pd.Series(
            list(zip(*(data[ii].values for ii in dimensions))))
        factors, keys = pd.factorize(dimension_tuples, sort=True)
        results = np.zeros(len(keys))
        for ii in range(len(keys)):
            results[ii] = _weighted_sum(data[metric].values,
                                        weights * (factors == ii)) / total
        output = pd.DataFrame(results,
                              index=pd.MultiIndex.from_tuples(keys,
                                                              names=dimensions),
                              columns=[""])
        output = output.sort_index(ascending=ascending).cumsum()
        return output

    if name is None:
        name = "{} Cumulative Distribution".format(metric)
    super(CumulativeDistribution, self).__init__(name, _calculate, "dataframe")
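# Minimal sketch of the final step above: sort the per-group shares by
# index, then accumulate to obtain the cumulative distribution.
import pandas as pd

shares = pd.DataFrame([0.5, 0.3, 0.2], index=["a", "b", "c"], columns=[""])
print(shares.sort_index(ascending=True).cumsum())
# a -> 0.5, b -> 0.8, c -> 1.0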
def factorize(train, test, features, na_value=-9999, full=False, sort=True):
    """Factorize categorical features.

    Parameters
    ----------
    train : pd.DataFrame
    test : pd.DataFrame
    features : list
        Column names in the DataFrame to be encoded.
    na_value : int, default -9999
        Code assigned to values absent from the fitted categories.
    full : bool, default False
        Whether to fit on values from both train and test, or only from train.
    sort : bool, default True
        Sort by values.

    Returns
    -------
    train : pd.DataFrame
    test : pd.DataFrame
    """
    for column in features:
        if full:
            vs = pd.concat([train[column], test[column]])
            labels, indexer = pd.factorize(vs, sort=sort)
        else:
            labels, indexer = pd.factorize(train[column], sort=sort)
        # Map each frame's values onto the fitted categories; values the
        # indexer has not seen come back as -1.
        train[column] = indexer.get_indexer(train[column])
        test[column] = indexer.get_indexer(test[column])
        if na_value != -1:
            train[column] = train[column].replace(-1, na_value)
            test[column] = test[column].replace(-1, na_value)
    return train, test
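# Hypothetical usage on toy frames. With the default full=False, the
# categories are fitted on train only, so values seen only in test map
# to na_value.
import pandas as pd

train = pd.DataFrame({"city": ["NY", "SF", "NY"]})
test = pd.DataFrame({"city": ["SF", "LA"]})
train, test = factorize(train, test, ["city"])
print(train["city"].tolist())  # [0, 1, 0]   sorted uniques: ['NY', 'SF']
print(test["city"].tolist())   # [1, -9999]  "LA" is unseen in train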
def model_data(data, LECAT=False, NAMEAN=False, NA999=False, OH=False,
               ONLYCONT=False, ONLYCAT=False, ONLYCATOH=False,
               COLSREMOVAL=False, cols=[], maxCategories=300):
    data = data.copy()
    cat_var = list(data.select_dtypes(["object"]).columns)
    cont_var = list(data.select_dtypes(["float", "int"]).columns)
    if COLSREMOVAL:
        data = data.drop(cols, axis=1, inplace=False)
        cat_var = list(data.select_dtypes(["object"]).columns)
        cont_var = list(data.select_dtypes(["float", "int"]).columns)
    if NAMEAN:
        # Mean-impute missing values in continuous columns.
        for col in cont_var:
            data.loc[data[col].isnull(), col] = data[col].mean()
    if NA999:
        # Flag missing values in continuous columns with a sentinel.
        for col in cont_var:
            data.loc[data[col].isnull(), col] = -999
    if LECAT:
        # Label-encode every categorical column.
        for col in cat_var:
            data[col] = pd.factorize(data[col])[0]
    if OH:
        # One-hot encode low-cardinality categoricals; bin the rest.
        cols2dummy = [col for col in cat_var
                      if len(data[col].unique()) <= maxCategories]
        colsNot2dummy = [col for col in cat_var
                         if len(data[col].unique()) > maxCategories]
        data = pd.get_dummies(data, dummy_na=True, columns=cols2dummy)
        # Binning for the high-cardinality columns.
        for col in colsNot2dummy:
            data[col] = pd.factorize(data[col])[0]
            dcb = DummycolumnsBins(cols=col, prefix=col, nb_bins=2000)
            dcb.fit(data)
            pd_binned = dcb.transform(data)
            data = pd.concat([data, pd_binned], axis=1)
    if ONLYCONT:
        data = data[cont_var]
    if ONLYCAT:
        test_idx = data['ID']
        Y = data['target']
        data = data[cat_var]
        data['ID'] = test_idx
        data['target'] = Y
    if ONLYCATOH:
        test_idx = data['ID']
        Y = data['target']
        cols = list(set(data.columns).difference(set(cont_var)))
        print(cols)
        data = data[cols]
        data['ID'] = test_idx
        data['target'] = Y
    return data
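# Hypothetical call on toy data (column names are illustrative):
# mean-impute the numerics and label-encode the categoricals in one pass.
import pandas as pd

raw_df = pd.DataFrame({"cat": ["x", "y", None], "num": [1.0, None, 3.0]})
out = model_data(raw_df, LECAT=True, NAMEAN=True)
print(out)
#   cat  num
# 0   0  1.0
# 1   1  2.0
# 2  -1  3.0   <- pd.factorize codes missing values as -1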
def Load_data():
    train = pd.read_csv(path_train)
    test = pd.read_csv(path_test)
    # Combine train and test. (DataFrame.append was removed in pandas 2.0.)
    data_comb = pd.concat([train, test])
    # Found at https://www.kaggle.com/marcellonegro/prudential-life-insurance-assessment/xgb-offset0501/run/137585/code
    # Create any new variables.
    data_comb['Product_Info_2_char'] = data_comb.Product_Info_2.str[0]
    data_comb['Product_Info_2_num'] = data_comb.Product_Info_2.str[1]
    # Factorize categorical variables.
    data_comb['Product_Info_2'] = pd.factorize(data_comb['Product_Info_2'])[0]
    data_comb['Product_Info_2_char'] = pd.factorize(data_comb['Product_Info_2_char'])[0]
    data_comb['Product_Info_2_num'] = pd.factorize(data_comb['Product_Info_2_num'])[0]
    data_comb['BMI_Age'] = data_comb['BMI'] * data_comb['Ins_Age']
    med_keyword_columns = data_comb.columns[
        data_comb.columns.str.startswith('Medical_Keyword_')]
    data_comb['Med_Keywords_Count'] = data_comb[med_keyword_columns].sum(axis=1)
    print('Encode missing values')
    data_comb.fillna(-1, inplace=True)
    # Fix the dtype on the label column.
    data_comb['Response'] = data_comb['Response'].astype(int)
    # Split train and test.
    train = data_comb[data_comb['Response'] > 0].copy()
    test = data_comb[data_comb['Response'] < 1].copy()
    target = train['Response'].values
    le = preprocessing.LabelEncoder()
    y = le.fit_transform(target)
    train.drop(['Id', 'Response', 'Medical_History_10', 'Medical_History_24'],
               axis=1, inplace=True)
    test.drop(['Id', 'Response', 'Medical_History_10', 'Medical_History_24'],
              axis=1, inplace=True)
    # DataFrame.as_matrix was removed from pandas; use to_numpy instead.
    train = train.to_numpy()
    test = test.to_numpy()
    print('Construct labels for bumping')
    num_class = len(np.unique(target))
    # Ordinal encoding: column j is 1 wherever target <= j + 1.
    labels = np.zeros(shape=(train.shape[0], num_class - 1))
    labels[:, 0][target == 1] = 1
    labels[:, 6][target < 8] = 1
    for i in range(1, num_class - 2):
        labels[:, i][target < i + 2] = 1
    return train, test, target, labels
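# Sanity-check sketch (toy data) of the bumping labels built above:
# column j of `labels` flags target <= j + 1, i.e. an ordinal encoding
# of the eight response classes.
import numpy as np

target = np.array([1, 3, 8])
num_class = 8
labels = np.zeros((len(target), num_class - 1))
for j in range(num_class - 1):
    labels[:, j][target <= j + 1] = 1
print(labels[0])  # target=1 -> all ones
print(labels[1])  # target=3 -> [0. 0. 1. 1. 1. 1. 1.]
print(labels[2])  # target=8 -> all zeros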