def onehot_encode_bar(tr, te, cols=None, bar=10000):
    if cols is None:
        cols = [i for i in tr.columns.values if i in te.columns.values]
    vec = DictVectorizer()
    cat, num = [], []
    for col in cols:
        nu = tr[col].unique().shape[0]
        if (nu < bar and nu > 2) or tr[col].dtype == 'object':
            cat.append(col)
            tr[col] = tr[col].map(str)
            te[col] = te[col].map(str)
        else:
            num.append(col)
    print("start fitting num of cat features:", len(cat))
    X = vec.fit_transform(tr[cat].T.to_dict().values())
    Xt = vec.transform(te[cat].T.to_dict().values())
    print("done fitting", X.shape, Xt.shape)
    X = sparse.hstack([X, tr[num].values], format='csr')
    Xt = sparse.hstack([Xt, te[num].values], format='csr')
    return X, Xt
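A quick, hypothetical usage sketch for onehot_encode_bar; it assumes the imports the snippet relies on (pandas as pd, scipy.sparse as sparse, and DictVectorizer from sklearn.feature_extraction), and the toy frames are placeholders.

# Hedged example: 'city' and the low-cardinality 'clicks' become one-hot columns, 'is_weekend' stays numeric.
train = pd.DataFrame({'city': ['NY', 'LA', 'NY', 'SF'],
                      'clicks': [3, 5, 1, 7],
                      'is_weekend': [0, 1, 0, 1]})
test = pd.DataFrame({'city': ['LA', 'SF'],
                     'clicks': [2, 7],
                     'is_weekend': [1, 0]})
X_train, X_test = onehot_encode_bar(train, test)
print(X_train.shape, X_test.shape)   # train and test now share the same one-hot columns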
Python DictVectorizer() usage examples (source code)
def __init__(self, a_clf=None, a_grid_search=False):
    """Class constructor.

    Args:
      a_clf (classifier or None):
        classifier to use or None for default
      a_grid_search (bool): use grid search for estimating
        hyper-parameters

    """
    classifier = a_clf
    self._gs = a_grid_search
    if a_clf is None:
        classifier = XGBClassifier(max_depth=MAX_DEPTH,
                                   n_estimators=NTREES,
                                   learning_rate=ALPHA,
                                   objective="multi:softprob")
    self._clf = classifier
    # latest version of XGBoost cannot deal with non-sparse feature vectors
    self._model = Pipeline([("vect", DictVectorizer()),
                            ("clf", classifier)])
def _consolidate_pipeline(self, transformation_pipeline, final_model=None):
    # First, restrict our DictVectorizer or DataFrameVectorizer
    # This goes through and has DV only output the items that have passed our support mask
    # This has a number of benefits: speeds up computation, reduces memory usage, and combines several transforms into a single, easy step
    # It also significantly reduces the size of dv.vocabulary_ which can get quite large
    dv = transformation_pipeline.named_steps['dv']
    try:
        feature_selection = transformation_pipeline.named_steps['feature_selection']
        feature_selection_mask = feature_selection.support_mask
        dv.restrict(feature_selection_mask)
    except KeyError:
        pass

    # We have overloaded our _construct_pipeline method to work both to create a new pipeline from scratch at the start of training, and to go through a trained pipeline in exactly the same order and steps to take a dedicated FeatureSelection model out of an already trained pipeline
    # In this way, we ensure that we only have to maintain a single centralized piece of logic for the correct order a pipeline should follow
    trained_pipeline_without_feature_selection = self._construct_pipeline(trained_pipeline=transformation_pipeline, final_model=final_model)
    return trained_pipeline_without_feature_selection
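For reference, a standalone sketch of the restrict() call used above, with SelectKBest standing in for the feature-selection step; the data is purely illustrative.

# Hedged sketch of DictVectorizer.restrict(): drop vectorizer features that a
# feature-selection mask rejected, so transform() only emits the kept columns.
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectKBest, chi2

rows = [{'a': 1, 'b': 2, 'c': 0}, {'a': 0, 'b': 1, 'c': 3}]
labels = [0, 1]
dv = DictVectorizer()
X = dv.fit_transform(rows)
selector = SelectKBest(chi2, k=2).fit(X, labels)
dv.restrict(selector.get_support())   # keep only the selected columns
print(dv.get_feature_names())         # now lists just the surviving features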
def ohEncoding(data, cols, replace=False):
    if cols is None:
        cols = []
        for el, v in data.dtypes.items():
            if v == 'object':
                if el == 'key':
                    pass
                else:
                    cols.append(el)
        print("Categorical features not set, detected as categorical: %s" % str(cols))
    vec = DictVectorizer()
    mkdict = lambda row: dict((col, row[col]) for col in cols)
    vecData = pd.DataFrame(vec.fit_transform(data[cols].apply(mkdict, axis=1)).toarray())
    vecData.columns = vec.get_feature_names()
    vecData.index = data.index
    if replace is True:
        data = data.drop(cols, axis=1)
        data = data.join(vecData)
    return data, vecData, vec

# df, t, v = ohEncoding(df, col, replace=True)
def ohEncoding(data, cols=None, replace=False):
    if cols is None:
        cols = []
        for el, v in data.dtypes.items():
            if v == 'object':
                cols.append(el)
        print("Categorical features not set, detected as categorical: %s" % str(cols))
    vec = DictVectorizer()
    mkdict = lambda row: dict((col, row[col]) for col in cols)
    vecData = pd.DataFrame(vec.fit_transform(data[cols].apply(mkdict, axis=1)).toarray())
    vecData.columns = vec.get_feature_names()
    vecData.index = data.index
    if replace is True:
        data = data.drop(cols, axis=1)
        data = data.join(vecData)
    return data, vecData, vec
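A small, hypothetical usage sketch for ohEncoding (pandas imported as pd is assumed, as in the snippet above).

# Hedged example: one-hot encode the 'color' column and replace it in the frame.
df = pd.DataFrame({'color': ['red', 'blue', 'red'], 'size': [1, 2, 3]})
encoded_df, onehot_cols, vectorizer = ohEncoding(df, cols=['color'], replace=True)
print(encoded_df.columns.tolist())   # ['size', 'color=blue', 'color=red']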
def __init__(self, language='en', window_width=2, collapse_fes=True, target_size=None):
    """ Initializes the extractor.

    :param language: The language of the sentences that will be used
    :param window_width: how many tokens to look before and after each
     token when building its features.
    :param collapse_fes: Whether to collapse FEs to a single token
     or to keep them split.
    """
    self.language = language
    self.tagger = TTPosTagger(language)
    self.window_width = window_width
    self.collapse_fes = collapse_fes
    self.unk_feature = 'UNK'
    self.vectorizer = DictVectorizer()
    self.target_size = target_size
    self.reducer = TruncatedSVD(target_size) if target_size else None
    self.vocabulary = set()
    self.label_index = {}
    self.lu_index = {}
    self.stopwords = set(w.lower() for w in StopWords().words(language))
    self.start()
def __init__(self, a_clf=None, a_grid_search=False):
    """Class constructor.

    Initialize classifier.

    Args:
      a_clf (classifier or None):
        classifier to use or None for default
      a_grid_search (bool): use grid search for estimating hyper-parameters

    """
    classifier = a_clf or LinearSVC(C=DFLT_C,
                                    **DFLT_PARAMS)
    self._gs = a_grid_search
    self._model = Pipeline([("vect", DictVectorizer()),
                            ("clf", classifier)])
def _execute(self, sources, alignment_stream, interval):
    time_interval = TimeInterval(MIN_DATE, interval.end)
    param_doc = sources[0].window(time_interval, force_calculation=True).last()
    if param_doc is None:
        logging.debug("No model found in {} for time interval {}".format(sources[0].stream_id, time_interval))
        return

    steps = deserialise_json_pipeline({
        'vectorisation': DictVectorizer(sparse=False),
        'fill_missing': FillZeros(),
        'classifier': LinearDiscriminantAnalysis(),
        'label_encoder': LabelEncoder()
    }, param_doc.value)

    clf = Pipeline([(kk, steps[kk]) for kk in ('vectorisation', 'fill_missing', 'classifier')])
    locations = steps['label_encoder'].classes_
    data = sources[1].window(interval, force_calculation=True)
    for tt, dd in data:
        yield StreamInstance(tt, {locations[ii]: pp for ii, pp in enumerate(clf.predict_proba(dd)[0])})
def __init__(self, estimator, dtype=float, sparse=True):
    """
    :param estimator: scikit-learn classifier object.

    :param dtype: data type used when building feature array.
        scikit-learn estimators work exclusively on numeric data. The
        default value should be fine for almost all situations.

    :param sparse: Whether to use sparse matrices internally.
        The estimator must support these; not all scikit-learn classifiers
        do (see their respective documentation and look for "sparse
        matrix"). The default value is True, since most NLP problems
        involve sparse feature sets. Setting this to False may take a
        great amount of memory.
    :type sparse: boolean.
    """
    self._clf = estimator
    self._encoder = LabelEncoder()
    self._vectorizer = DictVectorizer(dtype=dtype, sparse=sparse)
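For illustration, the effect of the sparse flag on the underlying DictVectorizer (a standalone sketch with toy data).

# Hedged sketch: DictVectorizer(sparse=True) yields a scipy.sparse matrix,
# while sparse=False yields a dense numpy array of the same values.
from sklearn.feature_extraction import DictVectorizer

measurements = [{'city': 'Dubai', 'temperature': 33.0},
                {'city': 'London', 'temperature': 12.0}]
X_sparse = DictVectorizer(sparse=True).fit_transform(measurements)   # CSR matrix
X_dense = DictVectorizer(sparse=False).fit_transform(measurements)   # numpy.ndarray
print(type(X_sparse), type(X_dense))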
def predict_function():
    x_list = []
    line_list = []
    line_dict = {}
    predict_doc = joblib.load('logreg.pkl')      # trained classifier
    feature_doc = joblib.load("word_vec.pkl")    # saved feature data (loaded but not used below)
    y_train, x_train = get_feature()
    line = "bad bad good good"
    line_list = line.split()
    # Zero-initialise every feature seen in training, then overlay the test line's word counts,
    # so the freshly fitted DictVectorizer below yields the same (sorted) columns as in training.
    for line in x_train:
        for key in line:
            line_dict[key] = 0
    line_dict.update(dict(Counter(line_list)))
    for a in sorted(line_dict.items(), key=lambda x: x[1]):
        print(a)
    x_list.append(line_dict)
    # print(x_list)  # debug output
    # exit()         # debug stop; commented out so the prediction below actually runs
    X = DictVectorizer().fit_transform(x_list)
    pred = predict_doc.predict(X)
    prob = predict_doc.predict_proba(X)
    for pred, prob in zip(pred, prob):
        print(pred, prob)
def dimension_reduction():
    X = PPMI_matrix()
    word_list = list()
    vecdict_list = list()
    for word, vector in sorted(X.items()):
        word_list.append(word)
        vecdict_list.append(dict(vector))
    Dic2Vec = DictVectorizer(sparse=True)
    vector_list = Dic2Vec.fit_transform(vecdict_list)
    X_svd = svds(vector_list, 300)
    X_pca = np.dot(X_svd[0], np.diag(X_svd[1]))
    word_matrix = dict()
    for word, vector in zip(word_list, X_pca):
        word_matrix[word] = vector
    return word_matrix
def dimension_compression():
    X_t_c = make_matrix()
    token_list = []
    contexts_list = []
    for token, contexts in sorted(X_t_c.items()):
        token_list.append(token)
        contexts_list.append(contexts)
    pca = PCA(n_components=300)
    DictoVec = DictVectorizer(sparse=True)
    sparse = DictoVec.fit_transform(contexts_list)
    print(sparse.shape)
    vec_list = pca.fit_transform(sparse.toarray())   # PCA needs a dense array input
    word_vec = {}
    for token, vec in zip(token_list, vec_list):
        word_vec[token] = vec
    return word_vec
def dim_reduction():
    dic2vec = DictVectorizer(sparse=True)
    PPMI = getPPMI()
    tc = list()
    token_list = list()
    for token, contexts in sorted(PPMI.items()):
        token_list.append(token)
        contexts = dict(contexts)
        tc.append(contexts)
    tc_vec = dic2vec.fit_transform(tc)
    tc_svd = svds(tc_vec, 300)
    tc_pca = np.dot(tc_svd[0], np.diag(tc_svd[1]))
    word_vec = dict()
    for token, vec in zip(token_list, tc_pca):
        word_vec[token] = vec
    return word_vec
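As a side note, sklearn's TruncatedSVD can produce an equivalent 300-dimensional embedding of the vectorized PPMI matrix in a single call; a hedged alternative sketch, where tc_vec is the sparse matrix built in dim_reduction above.

# Hedged alternative: TruncatedSVD.fit_transform corresponds to the U * diag(S) product computed from svds.
from sklearn.decomposition import TruncatedSVD

tc_embedded = TruncatedSVD(n_components=300, algorithm='arpack').fit_transform(tc_vec)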
def _consolidate_pipeline(self, transformation_pipeline, final_model=None):
    # First, restrict our DictVectorizer or DataFrameVectorizer
    # This goes through and has DV only output the items that have passed our support mask
    # This has a number of benefits: speeds up computation, reduces memory usage, and combines several transforms into a single, easy step
    # It also significantly reduces the size of dv.vocabulary_ which can get quite large
    try:
        feature_selection = transformation_pipeline.named_steps['feature_selection']
        feature_selection_mask = feature_selection.support_mask
        transformation_pipeline.named_steps['dv'].restrict(feature_selection_mask)
    except KeyError:
        pass

    # We have overloaded our _construct_pipeline method to work both to create a new pipeline from scratch at the start of training, and to go through a trained pipeline in exactly the same order and steps to take a dedicated FeatureSelection model out of an already trained pipeline
    # In this way, we ensure that we only have to maintain a single centralized piece of logic for the correct order a pipeline should follow
    trained_pipeline_without_feature_selection = self._construct_pipeline(trained_pipeline=transformation_pipeline, final_model=final_model)
    return trained_pipeline_without_feature_selection
def data2Vector(self):
    vec = DictVectorizer()
    dummy_x = vec.fit_transform(self.feature_list).toarray()
    lb = LabelBinarizer()
    dummy_y = lb.fit_transform(self.label_list)
    return dummy_x, dummy_y

# Here the decision tree uses the ID3 algorithm, which selects features by information gain.
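A toy sketch of the conversion data2Vector performs; the shapes of feature_list and label_list are assumptions based on the snippet.

# Hedged sketch: dict features to a dense one-hot array, string labels to binary indicators.
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelBinarizer

feature_list = [{'outlook': 'sunny', 'windy': 'false'},
                {'outlook': 'rainy', 'windy': 'true'}]
label_list = ['no', 'yes']
dummy_x = DictVectorizer().fit_transform(feature_list).toarray()
dummy_y = LabelBinarizer().fit_transform(label_list)
print(dummy_x.shape, dummy_y.shape)   # (2, 4) and (2, 1)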
def transform(self, documents):
    """
    Returns a dictionary of text features in advance of a DictVectorizer.
    """
    for document in documents:
        # Collect token and vocabulary counts
        counts = Counter(
            item[0] for para in document for sent in para for item in sent
        )

        # Yield structured information about the document
        yield {
            'paragraphs': len(document),
            'sentences': sum(len(para) for para in document),
            'words': sum(counts.values()),
            'vocab': len(counts),
        }
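A minimal sketch of consuming this generator with a DictVectorizer; the extractor instance and the nested document format are assumptions taken from the snippet.

# Hedged sketch: vectorize the per-document statistic dicts yielded by transform().
from sklearn.feature_extraction import DictVectorizer

vec = DictVectorizer()
X = vec.fit_transform(list(extractor.transform(documents)))   # one row per document
print(vec.get_feature_names())   # ['paragraphs', 'sentences', 'vocab', 'words']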
##########################################################################
## Model Building Functions
##########################################################################
def create_feature(sent_list):
    feature_ = []
    polarity = []
    # list of feature dicts, one per sentence
    features_ = []
    # vectorizer that maps feature dicts to a sparse matrix
    vec = DictVectorizer()
    for line in sent_list:
        sentence = line.strip('\n').split()
        sentence2 = sentence.pop(0)          # the leading token is the polarity label
        polarity.append(int(sentence2))
        # print(polarity)
        feature_ = feature(sentence)
        '''
        for word in feature(sentence):
            feature_.append(word)
        print(feature_)
        '''
        features_.append(feature_vector(feature_))
    x_feature = vec.fit_transform(features_)
    return x_feature, polarity
def getFeatures(numWordsToUse, allTweets, allTweetsSentiment):
    # each corpus's getFeatures function is responsible for somehow loading in its own allTweets and allTweetsSentiment data
    # then it has to ensure that data is tokenized (leveraging the modular tokenization functionality in utils)
    # then shuffle the dataset
    # then create the frequency distribution and popularWords
    # then extract features from each tweet, and un-combine the sentiment again
    global popularWords

    formattedTweets, sentiment, popularWords = utils.nlpFeatureEngineering(
            allTweets, allTweetsSentiment, 0, numWordsToUse, 'counts'
        )

    # right now we have a data structure roughly equivalent to a dense matrix, except each row is a dictionary
    # DictVectorizer performs two key functions for us:
    # 1. transforms each row from a dictionary into a vector, using consistent placing of keys into indexed positions within each vector
    # 2. returns sparse vectors, saving enormous amounts of memory, which becomes very useful when training our models
    sparseFeatures = dv.fit_transform(formattedTweets)

    return sparseFeatures, sentiment
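A toy illustration of those two behaviours (standalone; dv here is a fresh DictVectorizer rather than the module-level one assumed by the snippet).

# Hedged sketch: consistent column positions per key, and sparse output by default.
from sklearn.feature_extraction import DictVectorizer

rows = [{'good': 2, 'movie': 1}, {'bad': 1, 'movie': 1}]
dv = DictVectorizer()
X = dv.fit_transform(rows)        # scipy.sparse matrix, shape (2, 3)
print(dv.vocabulary_)             # {'bad': 0, 'good': 1, 'movie': 2} -- fixed column per key
print(X.toarray())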
def test_unseen_or_no_features():
    D = [{"camelot": 0, "spamalot": 1}]
    for sparse in [True, False]:
        v = DictVectorizer(sparse=sparse).fit(D)

        X = v.transform({"push the pram a lot": 2})
        if sparse:
            X = X.toarray()
        assert_array_equal(X, np.zeros((1, 2)))

        X = v.transform({})
        if sparse:
            X = X.toarray()
        assert_array_equal(X, np.zeros((1, 2)))

        try:
            v.transform([])
        except ValueError as e:
            assert_in("empty", str(e))
def onehot_encode(tr, te, cols=None):
    if cols is None:
        cols = [i for i in tr.columns.values if i in te.columns.values]
    vec = DictVectorizer()
    for col in cols:
        tr[col] = tr[col].map(str)
        te[col] = te[col].map(str)
    print("start fitting")
    X = vec.fit_transform(tr[cols].T.to_dict().values())
    Xt = vec.transform(te[cols].T.to_dict().values())
    print("done fitting", X.shape, Xt.shape)
    return X, Xt
def _validate_input_col_descriptions(self):
    found_output_column = False
    self.cols_to_ignore = []
    expected_vals = set(['categorical', 'text', 'nlp'])

    for key, value in self.column_descriptions.items():
        value = value.lower()
        self.column_descriptions[key] = value
        if value == 'output':
            self.output_column = key
            found_output_column = True
        elif value == 'date':
            self.date_cols.append(key)
        elif value == 'ignore':
            self.cols_to_ignore.append(key)
        elif value in expected_vals:
            pass
        else:
            raise ValueError('We are not sure how to process this column of data: ' + str(value) + '. Please pass in "output", "categorical", "ignore", "nlp", or "date".')

    if found_output_column is False:
        print('Here is the column_descriptions that was passed in:')
        print(self.column_descriptions)
        raise ValueError('In your column_descriptions, please make sure exactly one column has the value "output", which is the value we will be training models to predict.')

    # We will be adding one new categorical variable for each date col
    # Be sure to add it here so the rest of the pipeline knows to handle it as a categorical column
    for date_col in self.date_cols:
        self.column_descriptions[date_col + '_day_part'] = 'categorical'

# We use _construct_pipeline at both the start and end of our training.
# At the start, it constructs the pipeline from scratch
# At the end, it takes FeatureSelection out after we've used it to restrict DictVectorizer, and adds final_model back in if we did grid search on it
def chiSquare(train_data, train_classes, topK):
    vectorizer = DictVectorizer()

    # Fit and transform the train data.
    x_train = vectorizer.fit_transform(train_data)
    y_train = train_classes

    if (x_train.shape[1] < topK):
        topK = x_train.shape[1]

    selector = SelectKBest(chi2, k=topK)
    x_new = selector.fit_transform(x_train, y_train)

    return vectorizer.inverse_transform(selector.inverse_transform(x_new))
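A hypothetical usage sketch of chiSquare: feature dicts in, dicts restricted to the top-K chi-squared features out (the toy data below is illustrative).

# Hedged example: keep the 2 features most associated with the class labels.
train_data = [{'good': 2, 'plot': 1}, {'bad': 3, 'plot': 1}, {'good': 1, 'fun': 1}]
train_classes = [1, 0, 1]
reduced = chiSquare(train_data, train_classes, topK=2)
print(reduced)   # list of dicts containing only the selected features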