def varianceFilter(train_data, train_classes, threshold):
    '''
    Variance filter: removes features whose variance over the training
    data falls below threshold * (1 - threshold).
    '''
    vectorizer = DictVectorizer()
    # Fit and transform the train data.
    x_train = vectorizer.fit_transform(train_data)
    sel = VarianceThreshold(threshold=(threshold * (1 - threshold)))
    x_new = sel.fit_transform(x_train)
    # Map the reduced matrix back to a list of feature dicts.
    return vectorizer.inverse_transform(sel.inverse_transform(x_new))
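A minimal usage sketch (the imports and toy dicts below are assumptions; the original module presumably imports these at file level):

from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import VarianceThreshold

# 'rare' appears once in three docs (variance 2/9 ≈ 0.22), below the 0.25 cutoff.
docs = [{'word': 1, 'rare': 1}, {'word': 3}, {'word': 2}]
filtered = varianceFilter(docs, None, 0.5)  # cutoff = 0.5 * (1 - 0.5) = 0.25
print(filtered)  # [{'word': 1.0}, {'word': 3.0}, {'word': 2.0}]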
def grid_search(estimator, data, featTypes=('BoW',), nFolds=10, random_seed=44, param_grid=()):
    labels = [x.severity for x in data]
    generatePrimaryFeats(data, featTypes)
    featurized = []
    for d in data:
        instance = {}
        for featname, values in d.feats.items():
            # Give each feature a unique name to avoid overwriting features.
            # If e.g. a concept feature has the same name as a BoW word, the old
            # code would overwrite one of the features.
            instance.update({"{0}-{1}".format(featname, k): v for k, v in values.items()})
        featurized.append(instance)
    vectorizer = DictVectorizer()
    x_train = vectorizer.fit_transform(featurized)
    folds = cross_validation.StratifiedKFold(labels, n_folds=nFolds, shuffle=True, random_state=random_seed)
    grid = GridSearchCV(estimator, param_grid=param_grid, scoring="f1", n_jobs=-1, cv=folds)
    fit_grid = grid.fit(x_train, labels)
    print(fit_grid.best_params_)
    return fit_grid.best_params_
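A hedged example call; the estimator and grid values are illustrative, and data is assumed to be the project's list of objects carrying severity and feats attributes:

from sklearn.svm import LinearSVC

best_params = grid_search(LinearSVC(), data,
                          featTypes=('BoW',), nFolds=5,
                          param_grid={'C': [0.01, 0.1, 1, 10]})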
def get_bootstrapped_trainset(trainSet, y_train, bootstrap_data, es, estimator, th_bs):
    new_train_set = list(trainSet)
    new_y_train = list(y_train)
    trainAndBSData = trainSet + bootstrap_data
    generateDataDrivenFeats(trainSet, trainAndBSData, es)
    featurized = featurize(trainAndBSData)
    train_feats = featurized[:len(trainSet)]
    test_feats = featurized[len(trainSet):]
    # Do feature selection on the train data only.
    train_feats = fs.runFeatureSelection(train_feats, y_train, es)
    train_feats, y_train, train_bucket = ss.runSampleSelection(train_feats, y_train, list(range(len(trainSet))), es)
    # Calculate inter-annotator weighting.
    weights_train = getWeights(trainAndBSData, train_bucket, es.weighInterAnnot)
    vectorizer = DictVectorizer()
    x_train = vectorizer.fit_transform(train_feats)
    x_test = vectorizer.transform(test_feats)
    if es.scaleData:
        min_max_scaler = MinMaxScaler()
        x_train = min_max_scaler.fit_transform(x_train.toarray())
        x_test = min_max_scaler.transform(x_test.toarray())
    model = train(estimator, x_train, y_train, weights_train, model=None)
    y_pred_prob = model.predict_proba(x_test)
    # Promote bootstrap instances the model labels with confidence above th_bs.
    for i, cur_y in enumerate(y_pred_prob):
        if np.max(cur_y) > th_bs:
            new_train_set.append(bootstrap_data[i])
            # Map the probability column back to its class label
            # (assumes a scikit-learn style classifier).
            new_y_train.append(model.classes_[np.argmax(cur_y)])
    return (new_train_set, new_y_train)
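The loop above is confidence-thresholded self-training: bootstrap instances the model labels with high confidence are promoted into the training set. A stripped-down, self-contained sketch of the same idea (all data and the 0.8 threshold are illustrative assumptions):

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

labeled = [{'w': 1.0}, {'w': 0.0}, {'w': 0.9}, {'w': 0.1}]
labels = [1, 0, 1, 0]
unlabeled = [{'w': 0.95}, {'w': 0.5}]

vec = DictVectorizer()
clf = LogisticRegression().fit(vec.fit_transform(labeled), labels)
for feats, p in zip(unlabeled, clf.predict_proba(vec.transform(unlabeled))):
    if p.max() > 0.8:  # plays the role of th_bs above
        labeled.append(feats)
        labels.append(clf.classes_[p.argmax()])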
def _vectorize(self, corpus, fit):
    assert isinstance(corpus, kindred.Corpus)
    matrices = []
    for feature in self.chosenFeatures:
        assert feature in self.featureInfo
        featureFunction = self.featureInfo[feature]['func']
        never_tfidf = self.featureInfo[feature]['never_tfidf']
        data = featureFunction(corpus)
        notEmpty = any(len(d) > 0 for d in data)
        if fit:
            if notEmpty:
                self.dictVectorizers[feature] = DictVectorizer()
                if self.tfidf and not never_tfidf:
                    self.tfidfTransformers[feature] = TfidfTransformer()
                    intermediate = self.dictVectorizers[feature].fit_transform(data)
                    matrices.append(self.tfidfTransformers[feature].fit_transform(intermediate))
                else:
                    matrices.append(self.dictVectorizers[feature].fit_transform(data))
        else:
            if feature in self.dictVectorizers:
                if self.tfidf and not never_tfidf:
                    intermediate = self.dictVectorizers[feature].transform(data)
                    matrices.append(self.tfidfTransformers[feature].transform(intermediate))
                else:
                    matrices.append(self.dictVectorizers[feature].transform(data))
    mergedMatrix = hstack(matrices)
    return mergedMatrix
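The per-feature pattern above, a DictVectorizer feeding an optional TfidfTransformer with the resulting blocks stacked side by side, can be sketched standalone (the toy feature groups are assumptions):

from scipy.sparse import hstack
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

ngram_feats = [{'the': 2, 'cat': 1}, {'the': 1, 'dog': 1}]
length_feats = [{'n_tokens': 3}, {'n_tokens': 2}]  # a 'never_tfidf' style feature

blocks = []
for data, use_tfidf in [(ngram_feats, True), (length_feats, False)]:
    counts = DictVectorizer().fit_transform(data)
    blocks.append(TfidfTransformer().fit_transform(counts) if use_tfidf else counts)
merged = hstack(blocks)  # one row per document, feature blocks concatenated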
def ohEncoding(data, cols, replace=False):
    vec = DictVectorizer()
    mkdict = lambda row: dict((col, row[col]) for col in cols)
    vecData = pd.DataFrame(vec.fit_transform(data[cols].apply(mkdict, axis=1)).toarray())
    vecData.columns = vec.get_feature_names()
    vecData.index = data.index
    if replace is True:
        data = data.drop(cols, axis=1)
        data = data.join(vecData)
    return data, vecData, vec
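A usage example on a small DataFrame (assuming pandas is imported as pd, as the function body already requires):

import pandas as pd

df = pd.DataFrame({'color': ['red', 'blue', 'red'], 'size': [1, 2, 3]})
encoded, dummies, vec = ohEncoding(df, ['color'], replace=True)
print(encoded.columns.tolist())  # ['size', 'color=blue', 'color=red']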
def __init__(self, name, warm_start=True):
    self.vocal = DictVectorizer()
    self.model = linear_model.LogisticRegression(warm_start=warm_start,
                                                 solver='sag',
                                                 max_iter=200,
                                                 verbose=0,
                                                 penalty='l2',
                                                 n_jobs=4)
def get_pipeline(clf=RandomForestClassifier(n_estimators=100, class_weight="balanced")):
    return make_pipeline(DictVectorizer(sparse=False), clf)
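Usage sketch with illustrative feature dicts; note that the default classifier is built once at definition time, the usual caveat with mutable default arguments:

pipeline = get_pipeline()
X = [{'age': 33, 'city': 'Oslo'}, {'age': 41, 'city': 'Bergen'}]
y = [0, 1]
pipeline.fit(X, y)
print(pipeline.predict([{'age': 38, 'city': 'Oslo'}]))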
def predict_function():
    x_list = []
    predict_doc = joblib.load('logreg.pkl')
    y_train, x_train = get_feature()
    for line in x_train:
        x_list.append(dict(Counter(line)))
    # Reuse the vectorizer fitted at training time; fitting a fresh
    # DictVectorizer here would assign different column indices and
    # silently break the loaded model's predictions.
    word_vec = joblib.load('word_vec.pkl')
    X = word_vec.transform(x_list)
    pred = predict_doc.predict(X)
    prob = predict_doc.predict_proba(X)
    return pred, y_train, prob
def log_regression():
    x_list = []
    logreg = LogisticRegression()
    y_train, x_train = get_feature()
    for line in x_train:
        x_list.append(dict(Counter(line)))
    word_vec = DictVectorizer()
    X = word_vec.fit_transform(x_list)
    logreg.fit(X, y_train)
    # Persist both the model and the fitted vectorizer for prediction time.
    joblib.dump(logreg, 'logreg.pkl')
    joblib.dump(word_vec, 'word_vec.pkl')
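A minimal round trip under the assumption that get_feature() yields the same data in both calls:

log_regression()  # fits and saves logreg.pkl / word_vec.pkl
pred, y_train, prob = predict_function()
print((pred == y_train).mean())  # training-set accuracy as a sanity check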
def cv_prediction(feature_dict, feature, polarity, threshold, folds):
    accuracy = 0
    precision = 0
    recall = 0
    f1 = 0
    count = 0
    dicvec = DictVectorizer()
    LR = LogisticRegression()
    kfold = KFold(len(polarity), n_folds=folds)
    for train, test in kfold:
        count += 1
        x = [feature[i] for i in train]
        y = [polarity[i] for i in train]
        # Append the full feature dictionary (label 0) so the vectorizer
        # sees every feature in every fold.
        x.append(feature_dict)
        y.append(0)
        LR.fit(dicvec.fit_transform(x), y)
        test_label = list()
        answer_label = [polarity[j] for j in test]
        for j in test:
            query = fit_feature(feature[j], feature_dict)
            result = -1 if query.shape[1] != len(feature_dict) else prediction(LR, query, threshold)
            test_label.append(result)
        accuracy += accuracy_score(answer_label, test_label)
        precision += precision_score(answer_label, test_label)
        recall += recall_score(answer_label, test_label)
        f1 += f1_score(answer_label, test_label)
        print('{}_fold finished.'.format(count))
    return accuracy, precision, recall, f1
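fit_feature and prediction are defined elsewhere in the original file; judging from the query.shape[1] check above, fit_feature likely vectorizes a single instance against the full vocabulary. A hypothetical sketch, not the original helper:

from sklearn.feature_extraction import DictVectorizer

def fit_feature(instance, feature_dict):
    # Hypothetical: fit on the full vocabulary plus the instance, so the
    # column count only matches len(feature_dict) when no key is unseen.
    vec = DictVectorizer()
    return vec.fit_transform([feature_dict, instance])[1:]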
def main():
    lr = joblib.load('./lr.pkl')
    # Note: fitting a fresh DictVectorizer here only matches the loaded model
    # if getFeature() reproduces exactly the training-time feature space.
    dic2vec = DictVectorizer()
    features = list()
    for line in open('sentiment.txt'):
        word_list = line[3:].strip('\n').strip().split()
        features.append(getFeature(word_list))
    x = dic2vec.fit_transform(features)
    with open('sentiment_prediction.txt', 'w') as fp:
        for sentiment, prob in zip(lr.predict(x), lr.predict_proba(x)):
            print('{}\t{}'.format(sentiment, prob), file=fp)
# custom_transformers.py, from the project pandas-pipelines-custom-transformers (author: jem1031)
def fit(self, X, y=None):
    # Assumes all columns of X are strings.
    Xdict = X.to_dict('records')
    self.dv = DictVectorizer(sparse=False)
    self.dv.fit(Xdict)
    return self
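In a scikit-learn custom transformer, fit is normally paired with a transform; a sketch of what the matching method plausibly looks like in this project (an assumption, the original transform is not shown in this excerpt):

import pandas as pd

def transform(self, X):
    # Vectorize with the DictVectorizer fitted above; return a DataFrame so
    # downstream pandas-based pipeline steps keep working.
    Xdict = X.to_dict('records')
    Xt = self.dv.transform(Xdict)
    return pd.DataFrame(Xt, columns=self.dv.get_feature_names(), index=X.index)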
def test_dictvectorizer(self):
    D = [{"foo": 1, "bar": 3},
         {"bar": 4, "baz": 2},
         {"bar": 1, "quux": 1, "quuux": 2}]
    for sparse in (True, False):
        for dtype in (int, np.float32, np.int16):
            for sort in (True, False):
                v = DictVectorizer(sparse=sparse, dtype=dtype, sort=sort)
                v = v.fit(D)
                self._test_conversion(D, v)
def test_unseen_or_no_features(self):
    D1 = [{"camelot": 0, "spamalot": 1}]
    D2 = [{}, {"nothing": 21}]
    for sparse in (True, False):
        for dtype in (int, np.float32, np.int16):
            for sort in (True, False):
                v = DictVectorizer(sparse=sparse, dtype=dtype, sort=sort)
                v = v.fit(D1)
                self._test_conversion(D2, v)
def test_int_features_in_pipeline(self):
    import numpy.random as rn
    import pandas as pd
    rn.seed(0)
    x_train_dict = [dict((rn.randint(100), 1)
                         for i in range(20))
                    for j in range(100)]
    y_train = [0, 1] * 50
    from sklearn.pipeline import Pipeline
    from sklearn.feature_extraction import DictVectorizer
    from sklearn.linear_model import LogisticRegression
    pl = Pipeline([("dv", DictVectorizer()), ("lm", LogisticRegression())])
    pl.fit(x_train_dict, y_train)
    import coremltools
    model = coremltools.converters.sklearn.convert(pl, input_features="features", output_feature_names="target")
    x = pd.DataFrame({"features": x_train_dict,
                      "prediction": pl.predict(x_train_dict)})
    cur_eval_metrics = evaluate_classifier(model, x)
    self.assertEqual(cur_eval_metrics['num_errors'], 0)
def _validate_input_col_descriptions(self):
    found_output_column = False
    self.cols_to_ignore = []
    expected_vals = set(['categorical', 'text', 'nlp'])
    for key, value in self.column_descriptions.items():
        value = value.lower()
        self.column_descriptions[key] = value
        if value == 'output':
            self.output_column = key
            found_output_column = True
        elif value == 'date':
            self.date_cols.append(key)
        elif value == 'ignore':
            self.cols_to_ignore.append(key)
        elif value in expected_vals:
            pass
        else:
            raise ValueError('We are not sure how to process this column of data: ' + str(value) + '. Please pass in "output", "categorical", "ignore", "nlp", or "date".')
    if found_output_column is False:
        print('Here is the column_descriptions that was passed in:')
        print(self.column_descriptions)
        raise ValueError('In your column_descriptions, please make sure exactly one column has the value "output", which is the value we will be training models to predict.')
    # We will be adding one new categorical variable for each date col.
    # Be sure to add it here so the rest of the pipeline knows to handle it as a categorical column.
    for date_col in self.date_cols:
        self.column_descriptions[date_col + '_day_part'] = 'categorical'
    self.cols_to_ignore = set(self.cols_to_ignore)

# We use _construct_pipeline at both the start and end of our training.
# At the start, it constructs the pipeline from scratch.
# At the end, it takes FeatureSelection out after we've used it to restrict DictVectorizer, and adds final_model back in if we did grid search on it.