def word_bigrams():
    preprocessor = TextCleaner(lowercase=True,
                               filter_urls=True,
                               filter_mentions=True,
                               filter_hashtags=True,
                               alphabetic=True,
                               strip_accents=True,
                               filter_rt=True)
    pipeline = Pipeline([('vect', CountVectorizer(preprocessor=preprocessor,
                                                  ngram_range=(2, 2))),
                         ('tfidf', TfidfTransformer(sublinear_tf=True)),
                         ('scale', Normalizer())])
    return ('word_bigrams', pipeline)
def char_ngrams():
    vectorizer = CountVectorizer(min_df=1,
                                 preprocessor=TextCleaner(filter_urls=True,
                                                          filter_mentions=True,
                                                          filter_hashtags=True,
                                                          lowercase=False),
                                 analyzer='char_wb',
                                 ngram_range=(4, 4))
    pipeline = Pipeline([('vect', vectorizer),
                         ('tfidf', TfidfTransformer(sublinear_tf=True)),
                         ('scale', Normalizer())])
    return ('char_ngrams', pipeline)
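Both helpers return a (name, pipeline) tuple, which is exactly the shape FeatureUnion expects, so word and character features can be concatenated into one feature matrix. A minimal self-contained sketch of that wiring, with plain CountVectorizer settings standing in for the project's custom TextCleaner preprocessor:

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import Normalizer

# Stand-ins for word_bigrams()/char_ngrams() without the custom TextCleaner.
word_feats = ('word_bigrams', Pipeline([
    ('vect', CountVectorizer(lowercase=True, ngram_range=(2, 2))),
    ('tfidf', TfidfTransformer(sublinear_tf=True)),
    ('scale', Normalizer())]))
char_feats = ('char_ngrams', Pipeline([
    ('vect', CountVectorizer(analyzer='char_wb', ngram_range=(4, 4))),
    ('tfidf', TfidfTransformer(sublinear_tf=True)),
    ('scale', Normalizer())]))

features = FeatureUnion([word_feats, char_feats])
X = features.fit_transform(["a small example text", "another example text"])
print(X.shape)  # (2, n_word_bigrams + n_char_4grams)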
def TFIDF_result():
    str_handel_list = read_handel_list()  # read the 30 preprocessed documents, one string each
    str_test = read_test_list()           # read the test document as a single string
    # Build the TF-IDF corpus.
    corpus = str_handel_list[:]           # copy the preprocessed documents
    corpus.append(str_test)               # append the test document as the last entry
    print("TF-IDF corpus built successfully...")
    ######################### Compute TF-IDF weights with scikit-learn.
    # CountVectorizer converts the texts into a term-frequency matrix where
    # a[i][j] is the frequency of word j in document i.
    vectorizer = CountVectorizer()
    # TfidfTransformer computes the tf-idf weight of every word.
    transformer = TfidfTransformer()
    # The inner fit_transform builds the count matrix; the outer one converts it to tf-idf.
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    # All words in the bag-of-words vocabulary.
    word = vectorizer.get_feature_names()
    # Extract the tf-idf matrix; weight[i][j] is the tf-idf weight of word j in document i.
    weight = tfidf.toarray()
    print("TF-IDF scores calculated successfully...")
    # Collect the TF-IDF weights of document 30 (the appended test document).
    results = []
    for j in range(len(word)):
        if word[j] == '??' or word[j] == '??' or len(word[j]) == 1:  # skip filtered terms and single-character words
            continue
        results.append((word[j], weight[30][j]))  # keep the word together with its weight
    sorted_results = sorted(results, key=lambda result: result[1], reverse=True)  # sort by weight, descending
    # Write the 100 words with the highest TF-IDF weights.
    fp_tfidf_result = open("f://emotion/mysite/Label_extract/result_tfidf.txt", 'w+')
    tfidf_results = []
    for i in range(100):  # take the top 100 words and write each with its weight
        tfidf_results.append((sorted_results[i][0], sorted_results[i][1]))
        fp_tfidf_result.write(sorted_results[i][0] + ' ' + str(round(sorted_results[i][1], 10)))
        fp_tfidf_result.write('\n')
    fp_tfidf_result.close()
    return tfidf_results
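Stripped of the file I/O, TFIDF_result follows the standard scikit-learn recipe: build a count matrix, reweight it with TF-IDF, then sort one document's row. A self-contained sketch of the same ranking step on a toy corpus:

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

corpus = ["the cat sat on the mat",
          "the dog ate my homework",
          "the cat ate the fish"]
vectorizer = CountVectorizer()
tfidf = TfidfTransformer().fit_transform(vectorizer.fit_transform(corpus))
words = vectorizer.get_feature_names_out()  # get_feature_names() before scikit-learn 1.0
row = tfidf.toarray()[-1]                   # weights for the last document
top = sorted(zip(words, row), key=lambda wr: wr[1], reverse=True)
print(top[:3])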
def _fit_tfidf_model(self, category, clf):
    y = self._get_mask_from_category(category)
    y_continuous = self._get_continuous_version_boolean_y(y)
    X = TfidfTransformer().fit_transform(self._X)
    clf.fit(X, y_continuous)
def fit_tfidf(count_vector):
    '''
    Fit a term-frequency transformer on a count matrix.
    '''
    tfidf_vector = TfidfTransformer(use_idf=False).fit(count_vector)
    return tfidf_vector
def fit_tfidf(count_vector):
    '''
    Fit a TF transformer on a count matrix.
    TF: the count vector normalized by document length.
    '''
    tfidf = TfidfTransformer(use_idf=False)
    tfidf_vector = tfidf.fit(count_vector)
    return tfidf_vector
def fit_tfidf(count_vector):
    tfidf = TfidfTransformer(use_idf=False)
    tfidf_vector = tfidf.fit(count_vector)
    return tfidf_vector
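All three fit_tfidf variants set use_idf=False, so the fitted transformer produces plain term frequencies normalized to unit L2 length per document. A quick check of that behavior:

import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer

counts = np.array([[3, 0, 1],
                   [0, 2, 2]])
tf = TfidfTransformer(use_idf=False).fit_transform(counts)
print(tf.toarray())                   # each row is counts / ||counts||_2
print(tf.multiply(tf).sum(axis=1))    # every row has unit L2 norm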
def train_sgdc(training_list):
    footnotes = []
    cate = []
    for i in training_list:
        footnotes.append(i[0])
        cate.append(i[1])
    text_clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                                               max_iter=5,  # n_iter in scikit-learn < 0.21
                                               random_state=42))])
    _ = text_clf.fit(footnotes, cate)
    return text_clf
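A usage sketch for train_sgdc: it expects a list of (footnote_text, category) pairs and returns the fitted pipeline, so prediction is a one-liner. The example pairs below are made up:

training = [("see accompanying notes", "boilerplate"),
            ("fair value of derivatives", "derivative"),
            ("refer to note 12", "boilerplate"),
            ("interest rate swap contracts", "derivative")]
clf = train_sgdc(training)
print(clf.predict(["notional value of swaps"]))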
def parseToBOW():
    vectorizer = CountVectorizer(min_df=1)
    texts = pickle.load(open(OUTFILE, 'rb'))[0]
    tdm = vectorizer.fit_transform(texts)
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(tdm)
    f = open(DATASET_PATH + "BOW.p", "wb")
    pickle.dump(tdm, f)
    f.close()
    f = open(DATASET_PATH + "BOW_TDIDF.p", "wb")
    pickle.dump(tfidf, f)
    f.close()
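Reading the pickled matrices back is symmetric; a minimal sketch, assuming DATASET_PATH is defined as in the snippet above:

import pickle

with open(DATASET_PATH + "BOW_TDIDF.p", "rb") as f:
    tfidf_matrix = pickle.load(f)  # scipy sparse matrix, as dumped above
print(tfidf_matrix.shape)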
feature_extractors.py (project: text-analytics-with-python, author: dipanjanS)
def tfidf_transformer(bow_matrix):
    transformer = TfidfTransformer(norm='l2',
                                   smooth_idf=True,
                                   use_idf=True)
    tfidf_matrix = transformer.fit_transform(bow_matrix)
    return transformer, tfidf_matrix
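Because tfidf_transformer returns the fitted transformer alongside the matrix, the learned IDF weights can be reused on unseen documents. A short usage sketch:

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
bow_train = cv.fit_transform(["train doc one", "train doc two"])
transformer, tfidf_train = tfidf_transformer(bow_train)
# Reuse the fitted transformer (with its learned IDF) on unseen documents:
tfidf_test = transformer.transform(cv.transform(["an unseen doc"]))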
def transformTFIDF(X_train_all, X_test_all):
    """Transform bag-of-events using TF-IDF.

    Arguments
    ---------
    X_train_all: pandas DataFrame
    X_test_all: pandas DataFrame

    Returns
    -------
    X_train_t: CSR matrix
    X_test_t: CSR matrix
    """
    tfidf_t = TfidfTransformer(norm='l2',
                               use_idf=True,
                               sublinear_tf=True,
                               smooth_idf=True)
    X_train = scipy.sparse.csr_matrix(X_train_all)
    X_test = scipy.sparse.csr_matrix(X_test_all)
    # Fit TFIDF using training data.
    tfidf_t.fit(X_train)
    # Transform both training and test data.
    X_train_t = tfidf_t.transform(X_train)
    X_test_t = tfidf_t.transform(X_test)
    return X_train_t, X_test_t
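A usage sketch with toy DataFrames, assuming scipy and pandas are imported as in the source module:

import pandas as pd

train = pd.DataFrame([[2, 0, 1], [0, 3, 1]], columns=['a', 'b', 'c'])
test = pd.DataFrame([[1, 1, 0]], columns=['a', 'b', 'c'])
X_train_t, X_test_t = transformTFIDF(train, test)
print(X_train_t.shape, X_test_t.shape)  # (2, 3) (1, 3)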
classifier.py (project: django_text_classifier, author: django-text-classifier)
def get_pipeline(name):
    x = TrainingSet.objects.filter(classifier=name).values_list('body',
                                                                flat=True)
    y = TrainingSet.objects.filter(classifier=name).values_list('target',
                                                                flat=True)
    pipeline = Pipeline([
        ('vector', CountVectorizer()),
        ('transform', TfidfTransformer()),
        ('bayes', MultinomialNB())
    ])
    pipeline.fit(x, y)
    return pipeline
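A hypothetical usage, assuming TrainingSet rows exist for a classifier named 'spam' (the name here is made up):

pipeline = get_pipeline('spam')
print(pipeline.predict(['limited time offer, click now']))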
def test_one_rf():
    Xtrain_raw, ytrain_raw = load_raw_data("sentidata_train_raw.pkl")
    print("training data loaded")
    print_label_frequency(ytrain_raw)
    ############# create the pipeline
    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=lambda x: x, max_features=3000)),
        ('tfidf', TfidfTransformer()),
        ('rf', RandomForestClassifier(n_estimators=500,
                                      max_depth=200,
                                      min_samples_split=10,
                                      oob_score=True,
                                      n_jobs=-1, verbose=1, class_weight='balanced')),
    ])
    ############# train
    pipeline.fit(Xtrain_raw, ytrain_raw)
    ############# check result
    rf = pipeline.steps[-1][1]
    print(rf.oob_score_)
    ############# training error
    ytrain_predict = pipeline.predict(Xtrain_raw)
    print(classification_report(y_true=ytrain_raw, y_pred=ytrain_predict))
    print(confusion_matrix(y_true=ytrain_raw, y_pred=ytrain_predict))
    ############# testing error
    Xtest_raw, ytest_raw = load_raw_data("sentidata_test_raw.pkl")
    ytest_predict = pipeline.predict(Xtest_raw)
    print(accuracy_score(y_true=ytest_raw, y_pred=ytest_predict))
    print(classification_report(y_true=ytest_raw, y_pred=ytest_predict))
def kmeans(class_num):
    """
    K-means clustering of sentences.
    :param class_num: number of clusters
    :return: class_list, e.g. [[sentence1, sentence2], [sentence3, sentence4]]
    """
    class_list = list()
    sentences_words, sentences = loadFile()
    vectorizer = CountVectorizer()    # converts texts into a term-frequency matrix; a[i][j] is the count of word j in sentence i
    transformer = TfidfTransformer()  # computes the tf-idf weight of every word
    # The inner fit_transform builds the count matrix; the outer one converts it to tf-idf.
    # sentences_words is a list of tokenized sentences joined by spaces, e.g. ["word word word", "word word"].
    tfidf = transformer.fit_transform(vectorizer.fit_transform(sentences_words))
    # weight has shape [n_sentences, n_vocab]; weight[i][j] is the tf-idf weight of word j in sentence i.
    weight = tfidf.toarray()
    clf = KMeans(n_clusters=class_num)
    clf.fit(weight)
    for i in range(class_num):
        class_list.append(list())
    print(clf.labels_)
    for i in range(len(clf.labels_)):  # clf.labels_ is e.g. [1, 3, 2, 5, 0, ...]: the cluster label of each sentence
        class_label = clf.labels_[i]
        class_list[class_label].append(sentences[i])
        # print("####### cluster " + str(clf.labels_[i]) + ": " + sentences_words[i])
    return class_list
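The same clustering can be written without the project-specific loadFile; note that KMeans accepts the sparse TF-IDF matrix directly, so the toarray() densification above is not strictly needed. A self-contained sketch:

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["cats purr and meow", "dogs bark loudly",
        "kittens meow softly", "puppies bark and play"]
weight = TfidfVectorizer().fit_transform(docs)  # sparse matrix, fed to KMeans as-is
clf = KMeans(n_clusters=2, n_init=10).fit(weight)
clusters = [[] for _ in range(2)]
for doc, label in zip(docs, clf.labels_):
    clusters[label].append(doc)
print(clusters)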
def __init__(self, min_df=2, norm="l2"):
    """ Constructor """
    self.cv = CountVectorizer(min_df=min_df)
    self.tfidf = TfidfTransformer(norm=norm)  # norm must be passed by keyword in recent scikit-learn
    self.LOG_IDF = None
    self.CORPUS_VOCAB = None
    self.OOV_IDF_VAL = 0  # min idf value to assign for out-of-vocabulary terms
    self.IDF_MODEL = dict()
def compute_query_idf(self, corpus):
    """ Compute IDF from s and t in case you have no externally computed IDF to use """
    cv = CountVectorizer(min_df=0.0)
    cv.fit_transform(corpus)
    self.logger.debug(cv.vocabulary_)
    freq_term_matrix = cv.transform(corpus)
    tfidf = TfidfTransformer(norm="l2")
    tfidf.fit(freq_term_matrix)
    log_idf = tfidf.idf_
    self.LOG_IDF = log_idf
    self.CORPUS_VOCAB = cv.vocabulary_
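After fitting, CORPUS_VOCAB maps each term to its column index and LOG_IDF holds the matching smoothed log-IDF values, so a term lookup is one index operation. A hypothetical helper (not part of the original class) that also applies the out-of-vocabulary fallback:

def idf_of(term, corpus_vocab, log_idf, oov_val=0.0):
    """Return the learned IDF weight for term, or oov_val for out-of-vocabulary terms."""
    idx = corpus_vocab.get(term)
    return log_idf[idx] if idx is not None else oov_val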
def getTF(dataset):
    tfidf = TfidfTransformer(norm=None)
    tfidf.fit(dataset['train'])
    return tfidf.idf_  # returns the learned IDF vector, not term frequencies
def getTF(dataset):
    tfidf = TfidfTransformer(norm=None)
    tfidf.fit(dataset['train'])
    return tfidf.idf_  # returns the learned IDF vector, not term frequencies
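Despite the name, both getTF variants return the transformer's learned idf_ vector: fitting computes the IDF statistics, and norm only matters at transform time. For example:

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

counts = CountVectorizer().fit_transform(["a cat", "a dog", "a cat and a dog"])
idf = TfidfTransformer(norm=None).fit(counts).idf_
print(idf)  # one smoothed log-IDF value per vocabulary term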
def tfidf_pipeline(df, ngram_range, lowercase, binary, min_df=2, max_df=1.0,
                   caps_features=False, pos_features=False, clf=LinearSVC()):
    return Pipeline([
        ('mapper', mapper(df, ngram_range, lowercase, binary, min_df, max_df, caps_features, pos_features)),
        ('scaler', TfidfTransformer()),
        ('clf', clf),
    ])
def file2mat(filename):
    transformer = TfidfTransformer()
    vectorizer = CountVectorizer(min_df=1, ngram_range=(1, 1))
    data = load(filename)
    reviews = [each_data['review'] for each_data in data]
    bag_of_word = vectorizer.fit_transform(reviews)
    tfidf = transformer.fit_transform(bag_of_word)
    aspect_label = collect_aspect_label(data)
    rating_label = collect_rating_label(data)
    return tfidf, aspect_label, rating_label