def function_2(text):
    # Text taken from the user's uploaded PDF or URL, cleaned and formatted.
    paragraphs = text.split('\n\n')
    count_vect = CountVectorizer()
    bow_matrix = count_vect.fit_transform(paragraphs)
    normalized_matrix = TfidfTransformer().fit_transform(bow_matrix)
    # Term frequency/inverse document frequency applied; the product with the
    # transpose gives a paragraph-to-paragraph similarity graph.
    similarity_graph = normalized_matrix * normalized_matrix.T
    nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)
    scores = nx.pagerank(nx_graph)  # TextRank applied
    # Sort all paragraphs from highest to lowest score.
    ranked = sorted(((scores[i], s) for i, s in enumerate(paragraphs)), reverse=True)
    ten_percent = int(round(10.0 / 100.0 * len(ranked)))
    ten_percent_high_scores = ranked[0:ten_percent]
    # Take the top 10%, i.e. the paragraphs with the highest scores (rank order preserved).
    summary = [x[1] for x in ten_percent_high_scores]
    return "\n\n".join(summary)
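A minimal usage sketch for function_2 (not part of the original project): it assumes CountVectorizer and TfidfTransformer come from sklearn.feature_extraction.text, networkx is imported as nx, and the input text below is made up. With fewer than roughly ten paragraphs the 10% cut rounds to zero and the summary comes back empty.

# Hypothetical usage sketch (assumed imports; sample_text is invented).
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import networkx as nx

sample_text = "\n\n".join("Paragraph %d about a different topic." % i for i in range(20))
summary = function_2(sample_text)
print(summary)  # the top-scoring ~10% of paragraphs, joined by blank lines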
Python examples of the TfidfTransformer() class
Source file: papyrus_summary_extraction_tool.py
Project: Papyrus--simple-but-effective-text-summarization-tool
Author: RebeccaMerrett
def removeSimilarSentences(generatedSentences, originalSentences, stopwords, threshold=0.80):
    docs = []
    for sent, sim in generatedSentences:
        docs.append(sent)
    docs.extend(originalSentences)
    bow_matrix = StemmedTfidfVectorizer(stop_words=stopwords).fit_transform(docs)
    normalized = TfidfTransformer().fit_transform(bow_matrix)
    #simMatrix = (normalized[0:] * normalized[0:].T).A
    simindices = []
    #print 'Num original, ', len(originalSentences)
    for i in xrange(len(generatedSentences)):
        simGeneratedScores = linear_kernel(normalized[i], normalized[len(generatedSentences):]).flatten()
        if max(simGeneratedScores) >= threshold:
            simindices.append(i)
    #print simindices
    finalGen = [sentence for k, sentence in enumerate(generatedSentences) if k not in simindices]
    #print len(generatedSentences), len(finalGen)
    return finalGen
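removeSimilarSentences depends on a StemmedTfidfVectorizer class and on linear_kernel (from sklearn.metrics.pairwise) that are not shown in this excerpt. A minimal sketch of what such a stemming vectorizer commonly looks like, assuming NLTK's SnowballStemmer; this is an illustration, not the project's actual class:

# Hypothetical sketch of a stemming tf-idf vectorizer (assumes NLTK is installed).
import nltk.stem
from sklearn.feature_extraction.text import TfidfVectorizer

english_stemmer = nltk.stem.SnowballStemmer('english')

class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        # Wrap the default analyzer so every token is stemmed before counting.
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))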
Source file: twenty_news_group.py
Project: DataScience-And-MachineLearning-Handbook-For-Coders
Author: wxyyxc1992
def extract_feature(self):
    """
    Extract features from the training documents.
    """
    # Build the document-term (count) matrix.
    self.train_dtm = self.count_vect.fit_transform(self.data['train'].data)
    # Compute plain TF features (no IDF weighting).
    tf_transformer = TfidfTransformer(use_idf=False).fit(self.train_dtm)
    self.train_tf = tf_transformer.transform(self.train_dtm)
    # Compute TF-IDF features.
    tfidf_transformer = TfidfTransformer().fit(self.train_dtm)
    self.train_tfidf = tfidf_transformer.transform(self.train_dtm)
def feature(terms):
    dataMatrix = np.genfromtxt(finaltest, delimiter='|', dtype=None, skip_header=True)
    n = dataMatrix.size
    l = len(terms)
    occurence = np.zeros((n, l), dtype=np.int)
    d = 0
    for row in dataMatrix:
        temp = row[0].lower().decode('UTF-8').split(' ')
        for i in range(l):
            if terms[i] in temp:
                occurence[d][i] += 1
        d += 1
    transformer = TfidfTransformer()
    tfdif = transformer.fit_transform(occurence)
    occurence = tfdif.toarray()
    np.savetxt('occurencetest.csv', occurence, delimiter=',')
    return occurence, dataMatrix
def word_unigrams():
    preprocessor = TextCleaner(lowercase=True,
                               filter_urls=True,
                               filter_mentions=True,
                               filter_hashtags=True,
                               alphabetic=True,
                               strip_accents=True,
                               filter_rt=True)
    vectorizer = CountVectorizer(min_df=2,
                                 stop_words=get_stopwords(),
                                 preprocessor=preprocessor,
                                 ngram_range=(1, 1))
    pipeline = Pipeline([('vect', vectorizer),
                         ('tfidf', TfidfTransformer(sublinear_tf=True)),
                         ('scale', Normalizer())])
    return ('word_unigrams', pipeline)
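word_unigrams, like avg_spelling_error and punctuation_features further down, returns a (name, pipeline) tuple. A sketch of how such tuples are typically combined with a FeatureUnion, using plain scikit-learn components in place of the project's TextCleaner and get_stopwords (both of which are assumptions here):

# Illustrative sketch only: plugging (name, pipeline) feature tuples into a FeatureUnion.
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import Normalizer
from sklearn.linear_model import LogisticRegression

def simple_word_unigrams():
    # Stand-in for word_unigrams() without the project-specific preprocessor.
    pipeline = Pipeline([('vect', CountVectorizer(min_df=2, ngram_range=(1, 1))),
                         ('tfidf', TfidfTransformer(sublinear_tf=True)),
                         ('scale', Normalizer())])
    return ('word_unigrams', pipeline)

features = FeatureUnion([simple_word_unigrams()])
model = Pipeline([('features', features), ('clf', LogisticRegression())])
# model.fit(texts, labels) then trains directly on raw text documents.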
def Training_model():
    # Load the word-count training data.
    f = open("f://emotion/mysite/weibo_emotion/emotion_file/data_count.txt")
    f.readline()  # skip the header line
    data = np.loadtxt(f)
    # Load the class labels.
    f1 = open("f://emotion/mysite/weibo_emotion/emotion_file/data_jixing.txt")
    leibie = np.loadtxt(f1)
    f.close()
    f1.close()
    # TF-IDF weighting.
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(data)
    data1 = tfidf.toarray()
    # Train the SVM classifier.
    clf = svm.SVC()
    clf.fit(data1, leibie)  # training the SVC model
    return clf
def test_main(self):
    categories, documents = get_docs_categories()
    clean_function = lambda text: '' if text.startswith('[') else text
    entity_types = set(['GPE'])
    term_doc_mat = (
        TermDocMatrixFactory(
            category_text_iter=zip(categories, documents),
            clean_function=clean_function,
            nlp=_testing_nlp,
            feats_from_spacy_doc=FeatsFromSpacyDoc(entity_types_to_censor=entity_types)
        ).build()
    )
    clf = PassiveAggressiveClassifier(n_iter=5, C=0.5, n_jobs=-1, random_state=0)
    fdc = FeatsFromDoc(term_doc_mat._term_idx_store,
                       clean_function=clean_function,
                       feats_from_spacy_doc=FeatsFromSpacyDoc(
                           entity_types_to_censor=entity_types)).set_nlp(_testing_nlp)
    tfidf = TfidfTransformer(norm='l1')
    X = tfidf.fit_transform(term_doc_mat._X)
    clf.fit(X, term_doc_mat._y)
    X_to_predict = fdc.feats_from_doc('Did sometimes march UNKNOWNWORD')
    pred = clf.predict(tfidf.transform(X_to_predict))
    dec = clf.decision_function(X_to_predict)
def get_logistic_regression_coefs_l2(self, category,
                                     clf=RidgeClassifierCV()):
    ''' Computes l2-penalized logistic regression score.
    Parameters
    ----------
    category : str
        category name to score
    Returns
    -------
    (coefficient array, accuracy, majority class baseline accuracy)
    '''
    from sklearn.cross_validation import cross_val_predict
    y = self._get_mask_from_category(category)
    X = TfidfTransformer().fit_transform(self._X)
    clf.fit(X, y)
    y_hat = cross_val_predict(clf, X, y)
    acc, baseline = self._get_accuracy_and_baseline_accuracy(y, y_hat)
    return clf.coef_[0], acc, baseline
def get_logistic_regression_coefs_l1(self, category,
                                     clf=LassoCV(alphas=[0.1, 0.001],
                                                 max_iter=10000,
                                                 n_jobs=-1)):
    ''' Computes l1-penalized logistic regression score.
    Parameters
    ----------
    category : str
        category name to score
    Returns
    -------
    (coefficient array, accuracy, majority class baseline accuracy)
    '''
    from sklearn.cross_validation import cross_val_predict
    y = self._get_mask_from_category(category)
    y_continuous = self._get_continuous_version_boolean_y(y)
    # X = TfidfTransformer().fit_transform(self._X)
    X = self._X
    clf.fit(X, y_continuous)
    y_hat = (cross_val_predict(clf, X, y_continuous) > 0)
    acc, baseline = self._get_accuracy_and_baseline_accuracy(y, y_hat)
    clf.fit(X, y_continuous)
    return clf.coef_, acc, baseline
def getTFIDF():
    """
    Compute the tf-idf weight matrix for the segmented texts.
    :return: (tf-idf weight matrix, list of texts)
    """
    corpus, textList = getFenCiWords()
    vectorizer = CountVectorizer()  # turns the texts into a word-count matrix; a[i][j] is the count of word j in document i
    transformer = TfidfTransformer()  # computes the tf-idf weight of each word
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))  # inner fit_transform builds the count matrix, outer one computes tf-idf
    word = vectorizer.get_feature_names()  # all words in the bag-of-words vocabulary
    weight = tfidf.toarray()  # dense tf-idf matrix; a[i][j] is the tf-idf weight of word j in document i
    print "Documents: " + str(len(weight)) + ", vocabulary size: " + str(len(word))
    return weight, textList
    # for i in range(len(weight)):  # print the tf-idf weight of every word in every document
    #     print u"----- tf-idf weights for document", i, u"-----"
    #     for j in range(len(word)):
    #         print word[j], weight[i][j]
def __init__(self, corpus, relationtype, modelname="scikit_classifier"):
    super(ScikitRE, self).__init__()
    self.modelname = relationtype + "_" + modelname
    self.relationtype = relationtype
    self.pairtype = relationtype
    self.corpus = corpus
    self.pairs = []
    self.features = []
    self.labels = []
    self.pred = []
    self.clusters = word2vec.load_clusters("corpora/Thaliana/documents-processed-clusters.txt")
    self.posfmeasure = make_scorer(f1_score, average='binary', pos_label=True)
    self.generate_data(corpus, modelname, relationtype)
    self.text_clf = Pipeline([('vect', CountVectorizer(analyzer='char_wb', ngram_range=(3, 20), min_df=0.0, max_df=0.7)),
                              #('vect', CountVectorizer(ngram_range=(1,3), binary=False, max_features=None)),
                              #('tfidf', TfidfTransformer(use_idf=True, norm="l2")),
                              #('clf', SGDClassifier(loss='hinge', penalty='l1', alpha=0.0001, n_iter=5, random_state=42)),
                              #('clf', SGDClassifier())
                              #('clf', svm.NuSVC(nu=0.01 ))
                              #('clf', RandomForestClassifier(class_weight={False:1, True:2}, n_jobs=-1))
                              ('clf', MultinomialNB(alpha=0.01, fit_prior=False))
                              #('clf', DummyClassifier(strategy="constant", constant=True))
                              ])
def tfidf_feature(xtrain, xtest, stopwords_path):
    """
    tf-idf feature
    """
    xtrain = [" ".join(word) for word in xtrain]
    xtest = [" ".join(word) for word in xtest]
    stopwords = codecs.open(stopwords_path, 'r', encoding='utf-8').readlines()
    stopwords = [word.strip("\n") for word in stopwords]
    vectorizer_train = CountVectorizer(analyzer='word', stop_words=stopwords, min_df=5)
    count_train = vectorizer_train.fit_transform(xtrain)
    vectorizer_test = CountVectorizer(vocabulary=vectorizer_train.vocabulary_)
    count_test = vectorizer_test.fit_transform(xtest)
    transformer = TfidfTransformer()
    tfidf_train = transformer.fit(count_train).transform(count_train)
    # Reuse the IDF weights learned on the training counts for the test set.
    tfidf_test = transformer.transform(count_test)
    return tfidf_train.toarray(), tfidf_test.toarray()
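A small, self-contained usage sketch for tfidf_feature. The stop-word file and the toy token lists below are invented, and because of min_df=5 the training corpus needs terms that appear in at least five documents:

# Hypothetical usage of tfidf_feature with a throwaway stop-word file.
import codecs

with codecs.open("stopwords.txt", "w", encoding="utf-8") as f:
    f.write("the\nand\n")

xtrain = [["good", "movie"]] * 5 + [["bad", "movie"]]   # lists of tokens
xtest = [["good", "plot"]]
tfidf_train, tfidf_test = tfidf_feature(xtrain, xtest, "stopwords.txt")
print(tfidf_train.shape, tfidf_test.shape)  # both share the same number of columns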
def getModels(self):
    with open(self.data_path + '/categories.pkl', 'rb') as f:
        categories = cPickle.load(f)
    with open(self.data_path + '/category_map.pkl', 'rb') as f:
        category_map = cPickle.load(f)
    with open(self.data_path + '/article_classifier_model.pkl', 'rb') as f:
        clf = cPickle.load(f)
    count_vect = CountVectorizer()
    with open(self.data_path + '/count_vect.pkl', 'rb') as f:
        count_vect = cPickle.load(f)
    tfidf_transformer = TfidfTransformer()
    with open(self.data_path + '/tfidf_transformer.pkl', 'rb') as f:
        tfidf_transformer = cPickle.load(f)
    with open(self.data_path + '/tree.pkl', 'rb') as f:
        tree = cPickle.load(f)
    return categories, category_map, clf, count_vect, tfidf_transformer, tree
def get_topic_idf(self, sentences):
    vectorizer = CountVectorizer()
    sent_word_matrix = vectorizer.fit_transform(sentences)
    transformer = TfidfTransformer(norm=None, sublinear_tf=False, smooth_idf=False)
    tfidf = transformer.fit_transform(sent_word_matrix)
    tfidf = tfidf.toarray()
    centroid_vector = tfidf.sum(0)
    centroid_vector = np.divide(centroid_vector, centroid_vector.max())
    # print(centroid_vector.max())
    feature_names = vectorizer.get_feature_names()
    word_list = []
    for i in range(centroid_vector.shape[0]):
        if centroid_vector[i] > self.topic_threshold:
            # print(feature_names[i], centroid_vector[i])
            word_list.append(feature_names[i])
    return word_list
def test_tf_idf_smoothing():
    X = [[1, 1, 1],
         [1, 1, 0],
         [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=True, norm='l2')
    tfidf = tr.fit_transform(X).toarray()
    assert_true((tfidf >= 0).all())
    # check normalization
    assert_array_almost_equal((tfidf ** 2).sum(axis=1), [1., 1., 1.])
    # this is robust to features with only zeros
    X = [[1, 1, 0],
         [1, 1, 0],
         [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=True, norm='l2')
    tfidf = tr.fit_transform(X).toarray()
    assert_true((tfidf >= 0).all())
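For reference, the behavior this test checks can be reproduced by hand. A sketch of the smoothed-idf formula documented for TfidfTransformer (idf = ln((1 + n_docs) / (1 + df)) + 1, followed by l2 normalization of each row), applied to the first matrix above:

# Hand computation of smoothed tf-idf for the first test matrix.
import numpy as np

X = np.array([[1, 1, 1],
              [1, 1, 0],
              [1, 0, 0]], dtype=float)
n_docs = X.shape[0]
df = (X > 0).sum(axis=0)                       # document frequency of each term
idf = np.log((1.0 + n_docs) / (1.0 + df)) + 1.0
tfidf = X * idf                                # raw term counts times idf
tfidf /= np.linalg.norm(tfidf, axis=1, keepdims=True)  # l2-normalize each row
print((tfidf ** 2).sum(axis=1))                # [1. 1. 1.], matching the assertion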
def _vectorize(self, corpus, fit):
    assert isinstance(corpus, kindred.Corpus)
    matrices = []
    for feature in self.chosenFeatures:
        assert feature in self.featureInfo.keys()
        featureFunction = self.featureInfo[feature]['func']
        never_tfidf = self.featureInfo[feature]['never_tfidf']
        data = featureFunction(corpus)
        notEmpty = any(len(d) > 0 for d in data)
        if fit:
            if notEmpty:
                self.dictVectorizers[feature] = DictVectorizer()
                if self.tfidf and not never_tfidf:
                    self.tfidfTransformers[feature] = TfidfTransformer()
                    intermediate = self.dictVectorizers[feature].fit_transform(data)
                    matrices.append(self.tfidfTransformers[feature].fit_transform(intermediate))
                else:
                    matrices.append(self.dictVectorizers[feature].fit_transform(data))
        else:
            if feature in self.dictVectorizers:
                if self.tfidf and not never_tfidf:
                    intermediate = self.dictVectorizers[feature].transform(data)
                    matrices.append(self.tfidfTransformers[feature].transform(intermediate))
                else:
                    matrices.append(self.dictVectorizers[feature].transform(data))
    mergedMatrix = hstack(matrices)
    return mergedMatrix
def __init__(self, texts, ids, vocabulary=None, encoding='utf-8'):
    """Defined in the class declaration.
    Attributes:
        texts (list of str): Texts to classify.
        ids (list of str): Unique identifiers for each text (must have the
            same length as `texts`).
        vocabulary (list): Optional. Vocabulary to consider when vectorizing
            the texts. Default: uses every word present in the texts, except
            those in ES_stopwords.txt.
        encoding (str): Encoding of the texts in `texts` and in `ids`.
    """
    this_dir, this_filename = os.path.split(__file__)
    es_stopwords = pd.read_csv(os.path.join(this_dir, 'ES_stopwords.txt'),
                               header=None, encoding='utf-8')
    es_stopwords = list(np.squeeze(es_stopwords.values))
    self._check_id_length(ids)
    self.vectorizer = CountVectorizer(
        input='content', encoding=encoding, decode_error='strict',
        strip_accents='ascii', lowercase=True, preprocessor=None,
        tokenizer=None, stop_words=es_stopwords, ngram_range=(1, 1),
        analyzer='word', max_df=0.8, min_df=1, max_features=None,
        vocabulary=vocabulary, binary=False)
    self.transformer = TfidfTransformer()
    self.ids = None  # Keeps an ordered list of text ids.
    self.term_mat = None  # Matrix of term counts per text.
    self.tfidf_mat = None  # Matrix of term relevance (tf-idf).
    self.reload_texts(texts, ids)
def case1():
    from sklearn import datasets
    news = datasets.fetch_20newsgroups(subset='all')
    # print len(news.data)
    # print len(news.target)
    # print '*'*10
    # print news.data[0]
    # print '*'*10
    # print news.target[0]
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    vec = CountVectorizer()
    x = vec.fit_transform(news.data)
    # print x.shape
    # print x[:2]
    print x[:10, :10].toarray()
    TFIDF = TfidfTransformer()
    x_tfidf = TFIDF.fit_transform(x)
    print x_tfidf[:10, :10].toarray()
    from sklearn.cross_validation import train_test_split
    Xtrain, Xtest, ytrain, ytest = train_test_split(x, news.target, test_size=0.3, random_state=233)
    tf_Xtrain, tf_Xtest, tf_ytrain, tf_ytest = train_test_split(x_tfidf, news.target, test_size=0.3, random_state=233)
    from sklearn.naive_bayes import MultinomialNB
    mnb = MultinomialNB()
    tf_mnb = MultinomialNB()
    mnb.fit(Xtrain, ytrain)
    tf_mnb.fit(tf_Xtrain, tf_ytrain)
def fit(self, dataset, filename):
    self.logger.debug("fit")
    self.clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)),
                         ])
    self.clf.fit(dataset.get_dataset()['data'], dataset.get_dataset()['target'])
    joblib.dump(self.clf, filename + ".pkl", compress=9)

def fit(self, dataset, filename):
    self.logger.debug("fit")
    self.clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', MultinomialNB())
                         ])
    self.clf.fit(dataset.get_dataset()['data'], dataset.get_dataset()['target'])
    joblib.dump(self.clf, filename + ".pkl", compress=9)
def cvectorize(f, c, n):
    r"""Use the Count Vectorizer and TF-IDF Transformer.
    Parameters
    ----------
    f : pandas.DataFrame
        Dataframe containing the column ``c``.
    c : str
        Name of the text column in the dataframe ``f``.
    n : int
        The number of n-grams.
    Returns
    -------
    new_features : numpy.ndarray
        The transformed features (dense array).
    References
    ----------
    To use count vectorization and TF-IDF, you can find more
    information here [TFE]_.
    .. [TFE] http://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction
    """
    fc = f[c]
    fc.fillna(BSEP, inplace=True)
    cvect = CountVectorizer(ngram_range=[1, n], analyzer='char')
    cfeat = cvect.fit_transform(fc)
    tfidf_transformer = TfidfTransformer()
    new_features = tfidf_transformer.fit_transform(cfeat).toarray()
    return new_features
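A brief usage sketch for cvectorize; BSEP is a module-level fill constant not shown in this excerpt, so the example substitutes a hypothetical blank separator:

# Hypothetical usage of cvectorize (BSEP here is a stand-in for the module's constant).
import pandas as pd

BSEP = ' '
df = pd.DataFrame({'review': ['good movie', 'bad plot', None]})
features = cvectorize(df, 'review', 3)   # character 1- to 3-grams, tf-idf weighted
print(features.shape)                    # (3, number_of_character_n-grams)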
#
# Function apply_treatment
#
def __init__(self, norm='l2', use_idf=True, smooth_idf=True,
             sublinear_tf=False, **kwargs):
    self.tfidf = TfidfTransformer(norm=norm, use_idf=use_idf,
                                  smooth_idf=smooth_idf,
                                  sublinear_tf=sublinear_tf)
    # override defaults since we need the counts here
    self.verbose = kwargs.get('verbose', 0)
    binary = kwargs.pop('binary', False)
    dtype = kwargs.pop('dtype', np.int64)
    # pass remaining args to countvectorizer
    self._init_params(name="TFIDF", binary=binary, dtype=dtype, **kwargs)
Source file: twenty_news_group.py
Project: DataScience-And-MachineLearning-Handbook-For-Coders
Author: wxyyxc1992
def predict(self, docs):
    """
    Predict the categories of new documents.
    """
    X_new_counts = self.count_vect.transform(docs)
    tfidf_transformer = TfidfTransformer().fit(X_new_counts)
    X_new_tfidf = tfidf_transformer.transform(X_new_counts)
    return self.clf.predict(X_new_tfidf)
def normalize(counts):
    transformer = TfidfTransformer(smooth_idf=1)
    return transformer.fit_transform(counts).toarray()
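A quick usage sketch for normalize on a toy count matrix (values made up):

# Hypothetical usage: each row of the result is an l2-normalized tf-idf vector.
counts = [[3, 0, 1],
          [2, 0, 0],
          [0, 1, 4]]
print(normalize(counts))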
def main():
    sys.stdout = codecs.getwriter('utf8')(sys.stdout.buffer)
    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-i', '--input', help='Input file', required=True)
    parser.add_argument('-t', '--test', help='Test file', required=True)
    parser.add_argument('-o', '--output', help='Output filename prefix', required=True)
    parser.add_argument('-c', '--c', help='C value for SVM', type=float, default=1.0)
    parser.add_argument('-k', '--k', help='Number of features to keep', type=int, default=1000)
    args = parser.parse_args()
    data = read_semeval_regression(args.input, encoding='windows-1252')
    analyzer = get_rich_analyzer(word_ngrams=[2, 3], char_ngrams=[4])
    pipeline = Pipeline([
        ('vect', CountVectorizer(analyzer=analyzer)),
        ('tfidf', TfidfTransformer()),
        ('sel', SelectKBest(chi2, k=args.k)),
        ('clf', BinaryTreeRegressor(base_estimator=LinearSVC(C=args.c), verbose=False)),
    ])
    test = read_test_data(args.test, encoding='windows-1252')
    regressor = pipeline.fit(data[0], data[1])
    y = regressor.predict(test[2])
    with open('%sc%f-k%i-C.output' % (args.output, args.c, args.k), 'w', encoding='utf8') as outfile:
        for id_, topic, rate in zip(test[0], test[1], y):
            print(id_, topic, rate, sep='\t', file=outfile)
def fit(self, dataset, filename):
    self.logger.debug("fit")
    self.clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', MultinomialNB())
                         ])
    self.clf.fit(dataset.get_dataset()['data'], dataset.get_dataset()['target'])
    joblib.dump(self.clf, filename + ".pkl", compress=9)

def fit(self, dataset, filename):
    self.logger.debug("fit")
    self.clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', SGDClassifier(loss='log', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)),
                         ])
    self.clf.fit(dataset.get_dataset()['data'], dataset.get_dataset()['target'])
    joblib.dump(self.clf, filename + ".pkl", compress=9)
def feature():
    global termcount
    dataMatrix = np.genfromtxt(finaltrial, delimiter='|', dtype=None, skip_header=True)
    terms = []
    n = dataMatrix.size
    for row in dataMatrix:
        row[0] = row[0].lower().decode('UTF-8')
        temp = row[0].decode('UTF-8').replace(' ', '+')
        temp = (get.urlopen("http://localhost:5095/parser?sentence=" + temp).read()).decode('UTF-8')
        terms.extend([x.split('/')[0] for x in temp.split(' ') if
                      x.split('/')[1] == 'JJ' or x.split('/')[1].startswith('VB')])
        tfidf(temp)
    s = sum(list(termcount.values()))
    termcount = {x: (y * 100 / s) for x, y in zip(termcount.keys(), termcount.values())}
    # terms.extend([x for x in termcount.keys()])
    terms = list(set(terms))
    stop = open('stop.csv', 'r').read().splitlines()
    terms = [x for x in terms if x not in stop]
    l = len(terms)
    occurence = np.zeros((n, l), dtype=np.int)
    d = 0
    for row in dataMatrix:
        temp = row[0].decode('UTF-8').split(' ')
        for i in range(l):
            if terms[i] in temp:
                occurence[d][i] += 1
        d += 1
    transformer = TfidfTransformer()
    tfdif = transformer.fit_transform(occurence)
    occurence = tfdif.toarray()
    np.savetxt('occurence.csv', occurence, delimiter=',')
    return occurence, dataMatrix, terms
def avg_spelling_error(lang=None):
    pipeline = Pipeline([('feature', SpellingError(language=lang)),
                         ('tfidf', TfidfTransformer(sublinear_tf=False)),
                         ('scale', Normalizer())])
    return ('avg_spelling_error', pipeline)

def punctuation_features():
    pipeline = Pipeline([('feature', PunctuationFeatures()),
                         ('tfidf', TfidfTransformer(sublinear_tf=False)),
                         ('scale', Normalizer())])
    return ('punctuation_features', pipeline)