def get_pipeline_builder():
    pipe_builder = PipelineBuilder()

    # Feature Extraction
    params = {'ngram_range': [(1, 1), (1, 2), (1, 3)]}
    pipe_builder.add_extractor('CountVectorizer', CountVectorizer, 'Count Vectorizer', params)

    params = {}
    pipe_builder.add_extractor('HashingVectorizer', HashingVectorizer, 'Hashing Vectorizer', params)

    params = {}
    pipe_builder.add_extractor('TfidfVectorizer', TfidfVectorizer, 'TfIdf Vectorizer', params)

    # Dimension Reduction
    params = {}
    pipe_builder.add_reductor('No_Reduction', ModelNull, 'None', params)

    params = {}
    pipe_builder.add_reductor('TruncatedSVD', TruncatedSVD, 'Truncated SVD', params)

    # Normalization
    params = {}
    pipe_builder.add_normalizer('No_Normalization', ModelNull, 'None', params)

    params = {}
    pipe_builder.add_normalizer('Normalizer', Normalizer, 'Normalizer', params)

    # Classification Models
    params = {}
    pipe_builder.add_classifier('MultinomialNB', MultinomialNB, 'Multinomial Naive Bayes', params)

    params = {}
    pipe_builder.add_classifier('BernoulliNB', BernoulliNB, 'Bernoulli Naive Bayes', params)

    params = {}
    pipe_builder.add_classifier('KNeighborsClassifier', KNeighborsClassifier, 'K-Neighbors', params)

    params = {}
    pipe_builder.add_classifier('RadiusNeighborsClassifier', RadiusNeighborsClassifier, 'Radius Neighbors', params)

    return pipe_builder
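PipelineBuilder and ModelNull are project-specific helpers that are not shown on this page. As a rough, hypothetical sketch of what one registered combination (TfIdf Vectorizer -> Truncated SVD -> Normalizer -> K-Neighbors) looks like when wired together with plain scikit-learn; the toy corpus and labels below are illustrative assumptions:

# Hypothetical stand-alone sketch of one extractor/reductor/normalizer/classifier combination.
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.neighbors import KNeighborsClassifier

docs = ['spam spam ham', 'ham ham eggs', 'spam eggs spam', 'eggs ham ham']  # toy corpus (assumption)
labels = [1, 0, 1, 0]                                                       # toy labels (assumption)

pipeline = Pipeline([
    ('extractor', TfidfVectorizer(ngram_range=(1, 2))),
    ('reductor', TruncatedSVD(n_components=2)),
    ('normalizer', Normalizer()),
    ('classifier', KNeighborsClassifier(n_neighbors=1)),
])
pipeline.fit(docs, labels)
print(pipeline.predict(['spam eggs']))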
Python Normalizer() class: example source code
def make_k_means_clustering(self, short_filenames, input_texts):

    output_dir = self.output_dir + 'K_MEANS/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    if self.need_tf_idf:
        self.signals.PrintInfo.emit("Calculating TF-IDF...")
        idf_filename = output_dir + 'tf_idf.csv'
        msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
        self.signals.PrintInfo.emit(msg)

    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(input_texts)

    svd = TruncatedSVD(2)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X = lsa.fit_transform(X)

    km = KMeans(n_clusters=self.kmeans_cluster_count, init='k-means++', max_iter=100, n_init=10)
    km.fit(X)
    predict_result = km.predict(X)

    self.signals.PrintInfo.emit('\nDocuments by cluster:\n')

    clasters_output = ''
    for claster_index in range(max(predict_result) + 1):
        clasters_output += ('Cluster ' + str(claster_index) + ':\n')
        for predict, document in zip(predict_result, short_filenames):
            if predict == claster_index:
                clasters_output += ('  ' + str(document) + '\n')
        clasters_output += '\n'
    self.signals.PrintInfo.emit(clasters_output)

    self.signals.PrintInfo.emit('Saved to: ' + str(output_dir + 'clusters.txt'))
    writeStringToFile(clasters_output, output_dir + 'clusters.txt')

    self.signals.PrintInfo.emit('')
    self.signals.PrintInfo.emit('Cluster centers:')
    for index, cluster_center in enumerate(km.cluster_centers_):
        self.signals.PrintInfo.emit('  ' + str(index) + ': ' + str(cluster_center))

    self.draw_clusters_plot(X, predict_result, short_filenames)
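Outside the GUI class, the same LSA + k-means flow can be sketched with plain scikit-learn; the corpus and cluster count below are illustrative assumptions, not values from the original project:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans

texts = ['cats and dogs', 'dogs and wolves', 'stocks and bonds', 'bonds and yields']  # toy corpus (assumption)

# Count vectors -> 2-D LSA space (SVD + length normalization), then k-means on top.
X = CountVectorizer().fit_transform(texts)
lsa = make_pipeline(TruncatedSVD(n_components=2), Normalizer(copy=False))
X_lsa = lsa.fit_transform(X)

km = KMeans(n_clusters=2, init='k-means++', max_iter=100, n_init=10)
labels = km.fit_predict(X_lsa)
for text, label in zip(texts, labels):
    print(label, text)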
def make_dbscan_clustering(self, short_filenames, input_texts):

    output_dir = self.output_dir + 'DBSCAN/'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    if self.need_tf_idf:
        self.signals.PrintInfo.emit("Calculating TF-IDF...")
        idf_filename = output_dir + 'tf_idf.csv'
        msg = self.calculate_and_write_tf_idf(idf_filename, input_texts)
        self.signals.PrintInfo.emit(msg)

    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(input_texts)

    svd = TruncatedSVD(2)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X = lsa.fit_transform(X)

    db = DBSCAN(eps=self.dbscan_eps, min_samples=self.dbscan_min_pts)
    predict_result = db.fit_predict(X)  # fit_predict both fits the model and returns the labels

    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_

    self.signals.PrintInfo.emit('\nDocuments by cluster:\n')

    clasters_output = ''
    for claster_index in range(max(predict_result) + 1):
        clasters_output += ('Cluster ' + str(claster_index) + ':\n')
        for predict, document in zip(predict_result, short_filenames):
            if predict == claster_index:
                clasters_output += ('  ' + str(document) + '\n')
        clasters_output += '\n'

    clasters_output += ('Outlier cluster (-1):\n')
    for predict, document in zip(predict_result, short_filenames):
        if predict == -1:
            clasters_output += ('  ' + str(document) + '\n')
    clasters_output += '\n'

    self.signals.PrintInfo.emit(clasters_output)

    self.signals.PrintInfo.emit('Saved to: ' + str(output_dir + 'clusters.txt'))
    writeStringToFile(clasters_output, output_dir + 'clusters.txt')

    self.draw_clusters_plot(X, predict_result, short_filenames)
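For reference, a self-contained sketch of the DBSCAN step that separates the noise label -1 from the numbered clusters; the points, eps and min_samples values are illustrative assumptions rather than the project's settings:

import numpy as np
from sklearn.cluster import DBSCAN

# Toy 2-D points standing in for the LSA-reduced document vectors (assumption).
points = np.array([[0.0, 0.0], [0.1, 0.0], [0.0, 0.1],
                   [5.0, 5.0], [5.1, 5.0], [9.0, 0.0]])

db = DBSCAN(eps=0.5, min_samples=2)
labels = db.fit_predict(points)

clusters = {label: np.where(labels == label)[0] for label in set(labels) if label != -1}
noise = np.where(labels == -1)[0]
print('clusters:', clusters)  # indices of the two dense groups
print('noise:', noise)        # the isolated point at (9, 0)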
def new(stop_words=[], decomposition='SVD', n_components=5):
    # Prepare vectoriser engine
    idf = TfidfVectorizer(
        ngram_range=(1, 3),  # unigrams, bigrams & trigrams
        stop_words=stop_words
    )

    # Prepare normaliser
    norm = Normalizer(norm='max')
    print(colored('Texthasher model created', 'yellow'))

    # Prepare dimensionality reduction
    if decomposition and n_components:
        if decomposition == 'LDA':  # results in a non-negative matrix
            reducer = LatentDirichletAllocation(  # TF-IDF --> topic-term
                n_components=n_components,  # named n_topics in older scikit-learn releases
                max_doc_update_iter=20,
                max_iter=8
            )
            return [idf, norm, reducer]
        elif decomposition == 'SVD':
            reducer = TruncatedSVD(  # best for small datasets,
                n_components,        # a nightmare for large ones
                n_iter=8)            # damn slow
            return [idf, norm, reducer]
        elif decomposition == 'PCA':
            # When using IncrementalPCA, remember to always keep:
            #   n_samples > n_components > batch_size
            # reducer = IncrementalPCA(n_components)
            # A sparse -> dense conversion greedily consumes a large amount of memory:
            # to_dense = SparseToDense()
            # return [idf, norm, to_dense, reducer]
            reducer = SparsePCA(n_components)
            return [idf, norm, reducer]
        return [idf, norm]
    else:
        return [idf, norm]
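new() returns a list of pipeline steps rather than a fitted model. One hypothetical way to compose and apply them, assuming the imports used inside new() (TfidfVectorizer, Normalizer, TruncatedSVD, colored) are available and using a toy corpus:

from sklearn.pipeline import make_pipeline

docs = ['one small step', 'one giant leap', 'a leap of faith']  # toy corpus (assumption)

steps = new(stop_words=['a', 'of'], decomposition='SVD', n_components=2)
model = make_pipeline(*steps)        # TF-IDF -> max-norm -> truncated SVD
vectors = model.fit_transform(docs)  # dense, low-dimensional document vectors
print(vectors.shape)                 # (3, 2)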
def __init__(self, num_class=2):
    """
    :type num_class: int
    :rtype: None
    """
    self.__ctrl__ = None
    self.__case__ = None

    with open('../../.dbname', 'r') as f:
        self.__DB_NAME__ = json.load(f)['dbname']

    self.__MG_DOCS_COLL__ = 'raw-docs'         # raw docs
    self.__MG_SENTS_COLL__ = 'bag-of-sents'    # raw sentences
    self.__MG_TOKENS_COLL__ = 'sample-tokens'  # clean tokens (words)
    self.__PG_STATS_TBL__ = 'stats'            # stylometric features
    self.__PG_RESULTS_TBL__ = 'results_' + \
                              str(num_class) + \
                              'class'          # cross-validation results
    self.__PG_PROBAS_TBL__ = 'probabilities'   # cross-validation probabilities

    self.__model__ = Pipeline([
        # ('scaler2', StandardScaler()),
        # ('scaler', MinMaxScaler()),
        # ('scaler3', Normalizer()),
        ('classifier', SVC(probability=True,
                           kernel='poly',
                           degree=2,
                           class_weight='balanced')
                       if num_class - 1
                       else OneClassSVM(kernel='rbf',
                                        nu=0.7,
                                        gamma=1. / 250))
    ])

    print('Instantiated classifier %s.' %
          self.__model__.named_steps['classifier'].__class__.__name__)

    self.__io__ = DBIO(MG_DB_NAME=self.__DB_NAME__,
                       PG_DB_NAME=self.__DB_NAME__)

    self.__tagger__ = None     # initialise if re-creating samples
    self.__bootstrap__ = None  # initialise in fit
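The conditional inside the Pipeline selects OneClassSVM when num_class == 1 and a probabilistic SVC otherwise. A stripped-down sketch of that selection logic, with toy feature vectors as an assumption (the real features come from the stylometric 'stats' table):

import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC, OneClassSVM

def build_model(num_class=2):
    # num_class - 1 is falsy only when num_class == 1 (the one-class case).
    classifier = (SVC(probability=True, kernel='poly', degree=2, class_weight='balanced')
                  if num_class - 1
                  else OneClassSVM(kernel='rbf', nu=0.7, gamma=1. / 250))
    return Pipeline([('classifier', classifier)])

# Toy feature vectors and labels (assumption).
X = np.array([[0.0, 0.1], [0.2, 0.1], [0.1, 0.0], [1.0, 0.9], [0.9, 1.1], [1.1, 1.0]])
y = [0, 0, 0, 1, 1, 1]

binary = build_model(2).fit(X, y)
print(binary.predict([[0.05, 0.1]]))    # supervised two-class prediction

one_class = build_model(1).fit(X)       # unsupervised: no labels needed
print(one_class.predict([[5.0, 5.0]]))  # +1 = inlier, -1 = outlier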