def get_vectorizer(self, ngram_range=(1, 3), min_df=2, max_df=1.0):
"""
Define a binary CountVectorizer (Feature Presence) using n-grams and min and max document frequency
:param ngram_range: n-grams are created for all numbers within this range
:param min_df: min document frequency of features
:param max_df: max document frequency of features
:return:
"""
if self.is_weight == 'FP':#Feature Presence
vectorizer = CountVectorizer(ngram_range=ngram_range,
tokenizer=self.tokenize,
min_df=min_df,
max_df=max_df,
binary=True,
stop_words='english')
if self.is_weight == 'TF-IDF':#TF-IDF
vectorizer = TfidfVectorizer(ngram_range=ngram_range,
tokenizer=self.tokenize,
min_df=min_df,
max_df=max_df,
binary=True,
stop_words='english')
return vectorizer
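A minimal usage sketch, assuming a small host object that supplies the is_weight flag and tokenize method the snippet expects (the FeatureBuilder class below is hypothetical):

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

class FeatureBuilder:
    def __init__(self, is_weight='FP'):
        self.is_weight = is_weight  # 'FP' or 'TF-IDF'

    def tokenize(self, text):
        # stand-in tokenizer; the original class supplies its own
        return text.lower().split()

    get_vectorizer = get_vectorizer  # reuse the function above as a method

docs = ["a good movie", "a bad movie", "good good plot"]
vec = FeatureBuilder(is_weight='FP').get_vectorizer(min_df=1)
print(vec.fit_transform(docs).shape)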
Python CountVectorizer() example source code
def getTFV(token_pattern = token_pattern,
norm = tfidf__norm,
max_df = tfidf__max_df,
min_df = tfidf__min_df,
ngram_range = (1, 1),
vocabulary = None,
stop_words = 'english'):
tfv =TfidfVectorizer(min_df=min_df, max_df=max_df, max_features=None,
strip_accents='unicode', analyzer='word',
token_pattern=token_pattern,
ngram_range=ngram_range, use_idf=True,
smooth_idf=True, sublinear_tf=True,
stop_words = stop_words, norm=norm, vocabulary=vocabulary)
return tfv
#========= CountVectorizer =========#
Source file: papyrus_summary_extraction_tool.py
Project: Papyrus--simple-but-effective-text-summarization-tool
Author: RebeccaMerrett
def function_2(text):
paragraphs = text.split('\n\n')
count_vect = CountVectorizer()
bow_matrix = count_vect.fit_transform(paragraphs)
normalized_matrix = TfidfTransformer().fit_transform(bow_matrix)
similarity_graph = normalized_matrix * normalized_matrix.T #term frequency/inverse doc frequency applied
nx_graph = nx.from_scipy_sparse_matrix(similarity_graph)
scores = nx.pagerank(nx_graph) #TextRank applied
ranked = sorted(((scores[i],s) for i,s in enumerate(paragraphs)), reverse=True) #Sorts all paragraphs from highest to lowest scores
ten_percent = int(round(10.00/100.00 * len(ranked)))
ten_percent_high_scores = ranked[0:ten_percent]
summary = [x[1] for x in ten_percent_high_scores] #Takes top 10%, so the paragraphs with the highest scores (does not disturb the rank order)
return "\n\n".join(summary)
#Text taken from the user's uploaded PDF or URL, cleaned and formatted.
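A quick way to exercise the summarizer above (a sketch; it assumes the snippet's own imports, i.e. networkx as nx plus CountVectorizer and TfidfTransformer, and an older networkx, since from_scipy_sparse_matrix was removed in networkx 3.0):

paragraph = "Paragraph %d discusses topic %s in a few short sentences."
sample_text = "\n\n".join(paragraph % (i, "AB"[i % 2]) for i in range(20))
print(function_2(sample_text))  # roughly the top 10% of paragraphs (2 of 20 here)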
def getBOW(token_pattern = token_pattern,
max_df = bow__max_df,
min_df = bow__min_df,
ngram_range = (1, 1),
vocabulary = None,
stop_words = 'english'):
bow =CountVectorizer(min_df=min_df, max_df=max_df, max_features=None,
strip_accents='unicode', analyzer='word',
token_pattern=token_pattern,
ngram_range=ngram_range,
stop_words = stop_words, vocabulary=vocabulary)
return bow
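Both factories read module-level defaults (token_pattern, tfidf__norm, tfidf__max_df, tfidf__min_df, bow__max_df, bow__min_df) defined elsewhere in the original file. A sketch that sidesteps those globals by passing everything explicitly (the values here are illustrative assumptions):

docs = ["the quick brown fox", "the lazy dog", "quick quick fox"]
pattern = r"(?u)\b\w\w+\b"  # scikit-learn's own default token pattern
tfv = getTFV(token_pattern=pattern, norm='l2', max_df=1.0, min_df=1)
bow = getBOW(token_pattern=pattern, max_df=1.0, min_df=1)
print(tfv.fit_transform(docs).shape, bow.fit_transform(docs).shape)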
########################################################
# ------------------------------
# Simple text cleaning using
#
# -replacement dict
#
# or
#
# -WordReplacer object
#--------------------------------
def predict_job(job_list):
"""Assign a classification to a url"""
# TODO: Add case where len is 1 or 0....
job_list = [job for j in job_list for job in j]
new_job_list = [regex.tokenize_and_stem(i) for i in job_list]
new_job_list = [' '.join(job) for job in new_job_list]
vect = CountVectorizer()
x_series = pd.Series(X)  # X and y are module-level training data defined elsewhere in the source file
X_train_dtm = vect.fit_transform(x_series)
y_train = pd.Series(y)
job_list_series = pd.Series(new_job_list)
job_list_dtm = vect.transform(job_list_series)
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)
y_pred = nb.predict(job_list_dtm)
# for i in range(len(job_list)):
# print(job_list[i], y_pred[i])
return y_pred
# print(predict_job([('Founder',), ('Founder',), ('Architect & Full-stack developer',), ('Senior Engineer',), ('Technical Consultant',)]))
def run(self):
all_file_names = []
all_labels = []
for n, folder_name in enumerate(os.listdir(self.in_txtdir().path)):
full_folder_name = self.in_txtdir().path+'/'+folder_name
if os.path.isfile(full_folder_name):
continue
for file_name in os.listdir(full_folder_name):
all_labels.append(n)
all_file_names.append(full_folder_name+'/'+file_name)
vectorizer = CountVectorizer(input='filename')
vector = vectorizer.fit_transform(all_file_names)
numpy.save(self.out_npy().path,vector)
numpy.save('labels',numpy.array(all_labels)) #Where and how do we want to save this?
#This is just to test the tasks above
def gen_lstm_status(screen_name, timeline, short_url, depth):
# Create a vector of words and their frequency in on the user's timeline.
# Experimentation shows that requiring a word to appear in at least 4 * depth
# posts (min_df counts documents, not total occurrences) gives good results.
with open("stopwords.txt", 'r') as stopwords_file:
stopwords = [line.strip() for line in stopwords_file]
processed_timeline_text = [preprocess_post(post) for post in timeline]
vectorizer = CountVectorizer(min_df=4*depth, stop_words=stopwords)
X = vectorizer.fit_transform(processed_timeline_text)
vocab = vectorizer.get_feature_names()
topic = random.choice(vocab)
# Generates a status using a helper bash script.
proc = subprocess.Popen([NN_SAMPLE_COMMAND, topic], stdout=subprocess.PIPE)
status = topic + " " + proc.stdout.read().split("\n")[-2].strip()
return "@" + screen_name + " " + status + " " + short_url
def count_features(self,X,verbose=False):
'''
For each sample in X, count how often each feature in self.columns appears
on the decision path of every tree in self.estimators_.
X: a DataFrame whose columns include self.columns.
Returns a list of DataFrames (one per estimator), indexed like X, with
self.columns as columns.
'''
result=[]
for i,estimator in enumerate(self.estimators_):
tmp=pd.Series(estimator.apply(X[self.columns]))
tmp.index=X.index
tmp=tmp.map(lambda xx: ' '.join([yy[0] for yy in self.paths[i][xx]]))
vect=CountVectorizer(vocabulary=self.columns,lowercase=False)
tmp=vect.transform(tmp).toarray()
tmp=pd.DataFrame(tmp)
vocabulary_inverse={vect.vocabulary_[key]:key for key in vect.vocabulary_}
tmp.columns=[vocabulary_inverse[k] for k in range(tmp.shape[1])]
tmp.index=X.index
tmp.index.name=X.index.name
tmp=tmp.fillna(0)
result.append(tmp.copy())
if verbose:
print('Done:',i)
return result
def textToTokens(text):
"""Converts input string to a corpus of tokenized sentences.
Assumes that the sentences are divided by newlines (but will ignore empty sentences).
You can use this to try out your own datasets, but it is not needed for reading the homework data.
"""
corpus = []
sents = text.split("\n")
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
count_vect.fit(sents)
tokenizer = count_vect.build_tokenizer()
for s in sents:
toks = tokenizer(s)
if len(toks) > 0:
corpus.append(toks)
return corpus
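For example:

text = "The quick brown fox.\nJumps over the lazy dog.\n\n"
print(textToTokens(text))
# [['The', 'quick', 'brown', 'fox'], ['Jumps', 'over', 'the', 'lazy', 'dog']]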
def _vectorize_documents(self,method='tfidf',max_features=100):
stop_words = []
try:
for lexicon_id in self.params['cluster_lexicons']:
lexicon = Lexicon.objects.get(id=int(lexicon_id))
words = Word.objects.filter(lexicon=lexicon)
stop_words+=[word.wrd for word in words]
except KeyError:
    pass
if method == 'count':
vectorizer = CountVectorizer(analyzer='word', max_features=max_features, stop_words=stop_words)
if method == 'tfidf':
vectorizer = TfidfVectorizer(analyzer='word', max_features=max_features, stop_words=stop_words)
document_vectors = vectorizer.fit_transform(self.documents)
document_vectors = document_vectors.toarray()
return document_vectors,vectorizer.get_feature_names()
def __init__(self, match_fn=TermMatch, binary=True, dtype=np.bool_,
**cv_params):
"""initializes a Matching object
:match_fn: A matching function of signature `docs, query`
-> indices of matching docs
:binary: Store only binary term occurrences.
:dtype: Data type of internal feature matrix
:cv_params: Parameter for the count vectorizer such as lowercase=True
"""
# RetrievalBase.__init__(self)
self._match_fn = match_fn
self._vect = CountVectorizer(binary=binary, dtype=dtype,
**cv_params)
def is_embedded(sentence, embedding, analyzer):
"""
>>> embedding = ["a", "b", "c"]
>>> queries = ["a b c", "a", "b", "c", "a b c d", "d", "a b c" ]
>>> analyzer = lambda x: x.split()
>>> [query for query in queries if is_embedded(query, embedding, analyzer)]
['a b c', 'a', 'b', 'c', 'a b c']
>>> analyzer = CountVectorizer().build_analyzer()
>>> [query for query in queries if is_embedded(query, embedding, analyzer)]
['a b c', 'a', 'b', 'c', 'a b c']
"""
for word in analyzer(sentence):
if word not in embedding:
print("Dropping:", sentence, file=sys.stderr)
return False
return True
def train_feature_finder(self, training_db, clf):
training_sentences = []
c = 0
training_classes = []
self.class_names = []
self.vectorizer = CountVectorizer(analyzer = "word", \
tokenizer = None, \
preprocessor = None, \
stop_words = None, \
max_features = 500)
for key, value in training_db.items():
training_sentences += value
training_classes += [c for i in range(len(value))]
c+=1
self.class_names.append(key)
train_data_features = self.vectorizer.fit_transform(training_sentences)
train_data_features = train_data_features.toarray()
clf = clf.fit( train_data_features, training_classes)
return clf
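A minimal sketch of how this method could be driven (the host class here is a hypothetical stand-in; any scikit-learn classifier works as clf):

from sklearn.naive_bayes import MultinomialNB

class FeatureFinder:
    train_feature_finder = train_feature_finder  # reuse the method above

training_db = {"greeting": ["hello there", "hi, how are you"],
               "farewell": ["bye for now", "see you later"]}
finder = FeatureFinder()
clf = finder.train_feature_finder(training_db, MultinomialNB())
print(finder.class_names)  # ['greeting', 'farewell']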
def getDatas(dataset_dir_name):
movie_reviews = load_files(dataset_dir_name)
doc_str_list_train, doc_str_list_test, doc_class_list_train, doc_class_list_test = train_test_split(movie_reviews.data, movie_reviews.target, test_size = 0.2, random_state = 0)
#word_tokenizer: build_tokenizer() returns CountVectorizer's default tokenizer; Chinese text is segmented separately with getChList below
vectorizer = CountVectorizer(binary = True, decode_error = u'ignore')
word_tokenizer = vectorizer.build_tokenizer()
#segment each document into a list of terms
doc_terms_list_train = list(getChList(doc_str) for doc_str in doc_str_list_train)
doc_terms_list_test = list(getChList(doc_str) for doc_str in doc_str_list_test)
return vectorizer, doc_str_list_train, doc_str_list_test,doc_class_list_train, doc_class_list_test, doc_terms_list_train
def run():
py2neo.authenticate("localhost:7474","neo4j","neo4j1")
graph = Graph("http://localhost:7474/db/data/")
result=graph.data('''MATCH (n:Product)-[r:BELONGS_TO]->(c:Category) WITH n, rand() AS number RETURN n.name,n.description,n.catName order by number limit 3000''')
st = ""
for x in result:
p=','.join(str(val).strip(string.punctuation) for (key,val) in x.items())
st=st + p
p=""
vectorizer = CountVectorizer(strip_accents='ascii')
tokenizer = vectorizer.build_tokenizer()
preprocessor = vectorizer.build_preprocessor()
tokens = set()
for item in tokenizer(st):
tokens.add(preprocessor(item))
with codecs.open(path_config.PERSONAL_WORD_DICTIONARY_FILE, mode='wb', encoding='utf-8') as f:
for token in tokens:
f.write(token + '\n')
def word_unigrams():
preprocessor = TextCleaner(lowercase=True,
filter_urls=True,
filter_mentions=True,
filter_hashtags=True,
alphabetic=True,
strip_accents=True,
filter_rt=True)
vectorizer = CountVectorizer(min_df=2,
stop_words=get_stopwords(),
preprocessor=preprocessor,
ngram_range=(1, 1))
pipeline = Pipeline([('vect', vectorizer),
('tfidf', TfidfTransformer(sublinear_tf=True)),
('scale', Normalizer())])
return ('word_unigrams', pipeline)
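The returned (name, pipeline) tuple is shaped for scikit-learn's FeatureUnion; a sketch of the typical wiring (TextCleaner and get_stopwords are project-specific and must be importable for word_unigrams itself to run):

from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.linear_model import LogisticRegression

features = FeatureUnion([word_unigrams()])  # further (name, pipeline) blocks can be appended
model = Pipeline([('features', features), ('clf', LogisticRegression())])
# model.fit(texts, labels); model.predict(new_texts)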
def get_data():
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
vectorizer = CountVectorizer()
categories = ['alt.atheism', 'talk.religion.misc',
'comp.graphics', 'sci.space']
# Train set
newsgroups_train = fetch_20newsgroups(subset='train',
categories=categories, shuffle=True)
X_train = vectorizer.fit_transform(newsgroups_train.data)
y_train = newsgroups_train.target
# Test set
newsgroups_test = fetch_20newsgroups(subset='test',
categories=categories, shuffle=True)
X_test = vectorizer.transform(newsgroups_test.data)
y_test = newsgroups_test.target
return X_train, y_train, X_test, y_test
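A baseline classifier on the returned splits (downloads the 20 newsgroups data on first use):

from sklearn.naive_bayes import MultinomialNB

X_train, y_train, X_test, y_test = get_data()
clf = MultinomialNB().fit(X_train, y_train)
print("test accuracy: %.3f" % clf.score(X_test, y_test))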
def process(self, df, x_name, y_name=None, ngrams=2, max_features=35000, method='counts', binary=True, sparse=False):
#choosing the particular flavor of vectorizer
if method == 'counts':
vectorizer = CountVectorizer(max_features=max_features, ngram_range=(1, ngrams), decode_error='replace', binary=binary)
elif method == 'tfidf':
vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=(1, ngrams), decode_error='replace')
#fitting the vectorizer and converting the counts to an array
full_fit = vectorizer.fit_transform(df[x_name])
full_counts = full_fit.toarray()
self.vocabulary_ = vectorizer.vocabulary_
#passing the attributes up to the class instance
self.data = df
if sparse:
full_counts = csr_matrix(full_counts)
self.X = full_counts
if y_name is not None:
self.y = np.array(df[y_name])
return
#splits the data into training and test sets; either called from process()
#or on its own when your text is already vectorized and divided into x and y
def new(n_feature=128):
vectorizer = CountVectorizer(
encoding='utf-8',
ngram_range=(1,1), # Unigram only
max_features=n_feature,
binary=True
)
# Fill the gap (missing expected tags)
# ---
# Hypothesis: Some tags are somehow related so
# we smoothen the missing values with matrix factorisation.
smoother = NMF(n_components=n_feature)
# Binarise the vector's individual values
binariser = Binarizer(copy=True)
# Count vectoriser => NMF as smoother => Binariser
print(colored('Taghasher model created','yellow'))
return [vectorizer,smoother,binariser]
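A sketch of how the three returned stages might chain together on toy tag strings (assumes the module's own imports: CountVectorizer, sklearn's NMF and Binarizer, and termcolor's colored; reconstructing the smoothed matrix as W times the NMF components is one plausible reading of the "smoother" comment above, not necessarily the project's exact usage):

tags = ["rock indie", "rock metal", "jazz blues", "indie jazz"]
vectoriser, smoother, binariser = new(n_feature=4)
X = vectoriser.fit_transform(tags)  # binary tag-incidence matrix
W = smoother.fit_transform(X)       # low-rank document factors
X_smooth = binariser.fit_transform(W @ smoother.components_)  # back to 0/1
print(X_smooth.shape)               # (4, 4)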
def bag_of_words(messages, model=None, weighting=''):
# TODO: Add stemming or baseform here
messages, stemmings2baseform = texttools.stemming_messages_snowball(messages)
# Create new model for extrating text features if None is given
if model is None:
if weighting == 'tfidf':
model = TfidfVectorizer()
else:
model = CountVectorizer()
model.fit(messages)
# Extract features
x = model.transform(messages)
return x
def test_build(self):
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
count_vectorizer = CountVectorizer()
X_counts = count_vectorizer.fit_transform(newsgroups_train.data)
corpus = CorpusFromScikit(
X=X_counts,
y=newsgroups_train.target,
feature_vocabulary=count_vectorizer.vocabulary_,
category_names=newsgroups_train.target_names,
raw_texts=newsgroups_train.data
).build()
self.assertEqual(corpus.get_categories()[:2], ['alt.atheism', 'comp.graphics'])
self.assertEqual(corpus
.get_term_freq_df()
.assign(score=corpus.get_scaled_f_scores('alt.atheism'))
.sort_values(by='score', ascending=False).index.tolist()[:5],
['atheism', 'atheists', 'islam', 'atheist', 'belief'])
self.assertGreater(len(corpus.get_texts()[0]), 5)
def test_build(self):
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
count_vectorizer = CountVectorizer()
X_counts = count_vectorizer.fit_transform(newsgroups_train.data)
term_doc_mat = TermDocMatrixFromScikit(
X=X_counts,
y=newsgroups_train.target,
feature_vocabulary=count_vectorizer.vocabulary_,
category_names=newsgroups_train.target_names).build()
self.assertEqual(term_doc_mat.get_categories()[:2], ['alt.atheism', 'comp.graphics'])
self.assertEqual(term_doc_mat
.get_term_freq_df()
.assign(score=term_doc_mat.get_scaled_f_scores('alt.atheism'))
.sort_values(by='score', ascending=False).index.tolist()[:5],
['atheism', 'atheists', 'islam', 'atheist', 'belief'])
def make_lda(self, nt, iterations):
# '''
# description: sets important attributes and creates lda model
# params: nt-number of topics for lda
# iterations: number of iterations for lda
# dim: 2d or 3d graph
# threshold: minimum percentage of the maximum topic in a document which can be included in a "cluster"
# '''
self.nt = nt
self.cvectorizer = CountVectorizer(min_df=5, stop_words='english')
cvz = self.cvectorizer.fit_transform(self.descriptions)
# train an LDA model
self.lda_model = lda.LDA(n_topics=nt, n_iter=iterations)
self.X_topics_original = self.lda_model.fit_transform(cvz)
#initialize current stuff
self.X_topics_current = self.X_topics_original
self.titles_current = self.titles_original
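The two steps inside make_lda can be reproduced standalone (a sketch; assumes the lda package, whose LDA takes n_topics and n_iter exactly as in the snippet):

import lda
from sklearn.feature_extraction.text import CountVectorizer

descriptions = ["space rockets and orbits", "orbital mechanics of rockets",
                "guitar chords and scales", "minor scales on guitar"] * 5
cvz = CountVectorizer(min_df=5, stop_words='english').fit_transform(descriptions)
doc_topic = lda.LDA(n_topics=2, n_iter=100).fit_transform(cvz)
print(doc_topic.shape)  # (20, 2), one topic distribution per description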
Source file: feature_extraction.py
Project: political-ad-classifier
Author: BoudhayanBanerjee
def countvectorizer(inputpath=None, text=None):
"""
docstring
"""
vectorizer = CountVectorizer(min_df=1)
if inputpath:
filenames = [os.path.join(inputpath, file) for file in os.listdir(inputpath)]
corpus = []
for file in filenames:
with open(file, 'r') as f:
data = f.read()
corpus.append(data)
if text:
corpus = text
X = vectorizer.fit_transform(corpus)
print(X.toarray())
print(vectorizer.get_feature_names())
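Note that text must be a list of documents; a bare string would be iterated character by character and fail with an empty vocabulary. For example:

countvectorizer(text=["red apples", "green apples", "red or green"])
# prints the 3 x 4 count matrix and ['apples', 'green', 'or', 'red']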
def build_feature_matrix(documents, feature_type='frequency'):
feature_type = feature_type.lower().strip()
if feature_type == 'binary':
vectorizer = CountVectorizer(binary=True, min_df=1,
ngram_range=(1, 1))
elif feature_type == 'frequency':
vectorizer = CountVectorizer(binary=False, min_df=1,
ngram_range=(1, 1))
elif feature_type == 'tfidf':
vectorizer = TfidfVectorizer(min_df=1,
ngram_range=(1, 1))
else:
raise Exception("Wrong feature type entered. Possible values: 'binary', 'frequency', 'tfidf'")
feature_matrix = vectorizer.fit_transform(documents).astype(float)
return vectorizer, feature_matrix
def build_feature_matrix(documents, feature_type='frequency',
ngram_range=(1, 1), min_df=0.0, max_df=1.0):
feature_type = feature_type.lower().strip()
if feature_type == 'binary':
vectorizer = CountVectorizer(binary=True, min_df=min_df,
max_df=max_df, ngram_range=ngram_range)
elif feature_type == 'frequency':
vectorizer = CountVectorizer(binary=False, min_df=min_df,
max_df=max_df, ngram_range=ngram_range)
elif feature_type == 'tfidf':
vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df,
ngram_range=ngram_range)
else:
raise Exception("Wrong feature type entered. Possible values: 'binary', 'frequency', 'tfidf'")
feature_matrix = vectorizer.fit_transform(documents).astype(float)
return vectorizer, feature_matrix
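Either variant is used the same way; a quick sketch against the parameterized version above:

docs = ["the cat sat", "the cat ran", "a dog barked"]
vectorizer, X = build_feature_matrix(docs, feature_type='tfidf')
print(X.shape)
vectorizer, X = build_feature_matrix(docs, feature_type='binary', min_df=1)
print(X.toarray())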
def getTFIDF():
"""
:return: the tf-idf weight matrix and the list of source texts
"""
corpus, textList = getFenCiWords()
vectorizer = CountVectorizer()  # builds the term-count matrix: a[i][j] is the count of word j in document i
transformer = TfidfTransformer()  # converts raw counts to tf-idf weights
tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))  # inner fit_transform builds counts, outer one computes tf-idf
word = vectorizer.get_feature_names()  # all terms in the bag-of-words model
weight = tfidf.toarray()  # a[i][j] is the tf-idf weight of word j in document i
print("%d documents, %d words" % (len(weight), len(word)))
return weight, textList
# for i in range(len(weight)):  # print every word's tf-idf weight per document:
#     print("------- tf-idf weights for document", i, "------")  # outer loop walks documents,
#     for j in range(len(word)):  # inner loop walks the words
#         print(word[j], weight[i][j])
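The same two-step weighting works on any corpus, independent of the project's getFenCiWords segmenter; for example:

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

corpus = ["smart phones are popular", "phones with big screens",
          "popular apps for phones"]
counts = CountVectorizer().fit_transform(corpus)   # a[i][j]: count of term j in doc i
tfidf = TfidfTransformer().fit_transform(counts)   # reweight counts by tf-idf
print(tfidf.shape)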
def __init__(self, corpus, pairtype, relations, modelname="mil_classifier.model", test=False, ner="goldstandard",
generate=True):
super(MILClassifier, self).__init__()
self.modelname = modelname
self.pairtype = pairtype
self.pairs = {} # (e1.normalized, e2.normalized) => (e1, e2)
self.instances = {} # bags of instances (e1.normalized, e2.normalized) -> all instances with these two entities
self.labels = {} # (e1.normalized, e2.normalized) => label (-1/1)
self.bag_labels = [] # ordered list of labels for each bag
self.bag_pairs = [] # ordered list of pair labels (e1.normalized, e2.normalized)
self.data = [] # ordered list of bags, each is a list of feature vectors
self.predicted = [] # ordered list of predictions for each bag
self.resultsfile = None
self.examplesfile = None
self.ner_model = ner
self.vectorizer = CountVectorizer(min_df=0.2, ngram_range=(1, 1), token_pattern=r'\b\w+\-\w+\b')
self.corpus = corpus
#self.vectorizer = TfidfVectorizer(min_df=0.2, ngram_range=(1, 1), token_pattern=r'\b\w+\-\w+\b', max_features=)
#self.classifier = misvm.MISVM(kernel='linear', C=1.0, max_iters=20)
self.classifier = misvm.sMIL(kernel='linear', C=1)
#self.classifier = misvm.MissSVM(kernel='linear', C=100) #, max_iters=20)
#if generate:
# self.generateMILdata(test=test, pairtype=pairtype, relations=relations)