def represent(documents):
train_docs_id = list(filter(lambda doc: doc.startswith("train"), documents))
test_docs_id = list(filter(lambda doc: doc.startswith("test"), documents))
train_docs = [reuters.raw(doc_id) for doc_id in train_docs_id]
test_docs = [reuters.raw(doc_id) for doc_id in test_docs_id]
# Tokenization
vectorizer = TfidfVectorizer(tokenizer=tokenize)
# Learn and transform train documents
vectorised_train_documents = vectorizer.fit_transform(train_docs)
vectorised_test_documents = vectorizer.transform(test_docs)
# Transform multilabel labels
mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform([reuters.categories(doc_id) for doc_id in train_docs_id])
test_labels = mlb.transform([reuters.categories(doc_id) for doc_id in test_docs_id])
return vectorised_train_documents, train_labels, vectorised_test_documents, test_labels
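A minimal usage sketch (not from the original project), assuming the NLTK Reuters corpus has been downloaded and that the module-level imports the snippet relies on (reuters, TfidfVectorizer, MultiLabelBinarizer) and its `tokenize` helper are available:

# Hypothetical usage sketch; requires nltk.download('reuters') plus the snippet's own imports.
from nltk.corpus import reuters

X_train, y_train, X_test, y_test = represent(reuters.fileids())
print(X_train.shape, y_train.shape)   # (n_train_docs, n_terms), (n_train_docs, n_categories)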

def get_vectorizer(self, ngram_range=(1, 3), min_df=2, max_df=1.0):
"""
Define a binary CountVectorizer (feature presence) or a TfidfVectorizer, depending on self.is_weight, using n-grams and min/max document frequency
:param ngram_range: n-grams are created for every n within this range
:param min_df: minimum document frequency of features
:param max_df: maximum document frequency of features
:return: the configured vectorizer
"""
if self.is_weight == 'FP':  # Feature Presence
vectorizer = CountVectorizer(ngram_range=ngram_range,
tokenizer=self.tokenize,
min_df=min_df,
max_df=max_df,
binary=True,
stop_words='english')
elif self.is_weight == 'TF-IDF':  # TF-IDF weighting
vectorizer = TfidfVectorizer(ngram_range=ngram_range,
tokenizer=self.tokenize,
min_df=min_df,
max_df=max_df,
binary=True,
stop_words='english')
return vectorizer
def getTFV(token_pattern = token_pattern,
norm = tfidf__norm,
max_df = tfidf__max_df,
min_df = tfidf__min_df,
ngram_range = (1, 1),
vocabulary = None,
stop_words = 'english'):
tfv =TfidfVectorizer(min_df=min_df, max_df=max_df, max_features=None,
strip_accents='unicode', analyzer='word',
token_pattern=token_pattern,
ngram_range=ngram_range, use_idf=True,
smooth_idf=True, sublinear_tf=True,
stop_words = stop_words, norm=norm, vocabulary=vocabulary)
return tfv
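For illustration only, getTFV() could be called with explicit arguments; the module-level defaults it references (token_pattern, tfidf__norm, tfidf__max_df, tfidf__min_df) are assumed to be defined elsewhere in the original file:

# Hypothetical usage sketch; the values here override the module-level defaults.
docs = ["the cat sat on the mat", "a dog chased the cat around the mat"]
tfv = getTFV(token_pattern=r"(?u)\b\w\w+\b", norm="l2", max_df=1.0, min_df=1,
             ngram_range=(1, 2), stop_words=None)
X = tfv.fit_transform(docs)
print(X.shape, sorted(tfv.vocabulary_)[:5])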
#========= CountVectorizer =========#
def __init__(self, column_descriptions=None):
self.column_descriptions = column_descriptions
self.text_col_indicators = set(['text', 'nlp'])
self.text_columns = {}
for key, val in self.column_descriptions.items():
if val in self.text_col_indicators:
self.text_columns[key] = TfidfVectorizer(
# If we have any documents that cannot be decoded properly, just ignore them and keep going as planned with everything else
decode_error='ignore'
# Try to strip accents from characters. Using unicode is slightly slower but more comprehensive than 'ascii'
, strip_accents='unicode'
# Can also choose 'char', which will likely increase accuracy, at the cost of much more space, generally
, analyzer='word'
# Remove commonly found english words ('it', 'a', 'the') which do not typically contain much signal
, stop_words='english'
# Convert all characters to lowercase
, lowercase=True
# Only consider words whose document frequency is below max_df
# In this case, ignore all words that appear in more than 90% of all documents
, max_df=0.9
# Consider only the most frequently occurring 3000 words, after taking into account all the other filtering going on
, max_features=3000
)
def fit_tfidf(self, df):
'''
Function to fit a TF-IDF matrix to a corpus of text
INPUT:
df: df with 'lemmatized_text' to analyze
'''
self.tfidf = TfidfVectorizer(input='content',
use_idf=True,
lowercase=True,
max_features=self.tfidf_max_features,
max_df=self.tfidf_max_df,
min_df=self.tfidf_min_df)
self.tfidf_matrix = self.tfidf.fit_transform(
df['lemmatized_text']).toarray()
self.tfidf_features = np.array(self.tfidf.get_feature_names())
self.tfidf_reverse_lookup = {
word: idx for idx, word in enumerate(self.tfidf_features)}
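The term-to-column reverse lookup built above can be reproduced standalone; a minimal sketch with invented documents, assuming a recent scikit-learn where get_feature_names_out() replaces the get_feature_names() call used in the snippet:

# Standalone sketch of building a term -> column reverse lookup (hypothetical data).
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

docs = ["cat sit mat", "dog chase cat", "bird sing tree"]
tfidf = TfidfVectorizer()
matrix = tfidf.fit_transform(docs).toarray()
features = np.array(tfidf.get_feature_names_out())
reverse_lookup = {word: idx for idx, word in enumerate(features)}
print(matrix[:, reverse_lookup["cat"]])   # TF-IDF weight of "cat" in each document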
def create_vectorizer_selector(train_data, train_labels, model_file,
ngram_list=[1], max_num_features_list=[100],
analyzer_type_list=['word']):
"""Call creation and save of vectorizers and selectors including special cases.
Args:
train_data: list of train text samples
train_labels: list of train labels
model_file: model filename
ngram_list: list of ranges of n-grams
max_num_features_list: list of maximum number of features to select
analyzer_type_list: list of analyzer types for TfidfVectorizer, either 'word' or 'char'
Returns:
nothing
"""
for i in range(len(ngram_list)):
ngrams_selection(train_data, train_labels, 'general_' + str(i), model_file,
ngram_range_=(ngram_list[i], ngram_list[i]),
max_num_features=max_num_features_list[i],
analyzer_type=analyzer_type_list[i])
you_are_data = ngrams_you_are(train_data)
ngrams_selection(you_are_data, train_labels, 'special', model_file,
ngram_range_=(1,1), max_num_features=100)
return
def load_20ng_dataset_bow():
"""
Loads the 20NG dataset
:return:
"""
newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')
# Convert data to tf-idf
vectorizer = TfidfVectorizer(min_df=0.01, max_df=0.95)
train_data = vectorizer.fit_transform(newsgroups_train.data)
test_data = vectorizer.transform(newsgroups_test.data)
train_data = train_data.todense()
test_data = test_data.todense()
train_labels = newsgroups_train.target
test_labels = newsgroups_test.target
return train_data, train_labels, test_data, test_labels
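A short sketch of consuming the returned dense matrices, for example with a linear classifier; this is hypothetical follow-on code (the first call downloads 20 Newsgroups, and the snippet's own imports of fetch_20newsgroups and TfidfVectorizer are assumed):

# Hypothetical usage sketch: fit a simple classifier on the TF-IDF features.
import numpy as np
from sklearn.linear_model import LogisticRegression

train_data, train_labels, test_data, test_labels = load_20ng_dataset_bow()
clf = LogisticRegression(max_iter=1000)
clf.fit(np.asarray(train_data), train_labels)   # np.asarray converts the np.matrix output of todense()
print("test accuracy:", clf.score(np.asarray(test_data), test_labels))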
def fit(self, X_df, y=None):
# See if we should fit TfidfVectorizer or not
for key in X_df.columns:
if key in self.text_columns:
X_df[key].fillna('nan', inplace=True)
text_col = X_df[key].astype(str, raise_on_error=False)
self.text_columns[key].fit(text_col)
col_names = self.text_columns[key].get_feature_names()
# Make weird characters play nice, or just ignore them :)
for idx, word in enumerate(col_names):
try:
col_names[idx] = str(word)
except:
col_names[idx] = 'non_ascii_word_' + str(idx)
col_names = ['nlp_' + key + '_' + str(word) for word in col_names]
self.text_columns[key].cleaned_feature_names = col_names
return self
def train(self, train_size=0.8, k_folds=5):
# retrieve data from DB and pre-process
self._get_data()
# perform train/test split
self._get_train_test_split(train_size=train_size)
# define text pre-processing pipeline
text_pipeline = Pipeline([
('extract_text', DFColumnExtractor(TEXT_FEATURES)),
('vect', TfidfVectorizer(tokenizer=twitter_tokenizer))
])
# define pipeline for pre-processing of numeric features
numeric_pipeline = Pipeline([
('extract_nums', DFColumnExtractor(NON_TEXT_FEATURES)),
('scaler', MinMaxScaler())
])
# combine both steps into a single pipeline
pipeline = Pipeline([
('features', FeatureUnion([
('text_processing', text_pipeline),
('num_processing', numeric_pipeline)
])),
('clf', self._estimator)
])
self.logger.info('Fitting model hyperparameters with {0}-fold CV'.format(k_folds))
gs = GridSearchCV(pipeline, self.params, n_jobs=-1, cv=k_folds)
X = self.data.iloc[self.train_inds_, :]
y = self.data[LABEL].values[self.train_inds_]
gs.fit(X, y)
self.logger.info('Validation set accuracy is {0}'.format(gs.best_score_))
self.gs_ = gs
self.model_ = gs.best_estimator_
def tfidf(self):
#keep both hashtags and mentions
#token_pattern=r'(?u)@?#?\b\w\w+\b'
#remove hashtags and mentions
#token_pattern = r'(?u)(?<![#@])\b\w+\b'
#just remove mentions and remove hashsign from hashtags
#token_pattern = r'(?u)(?<![@])\b\w+\b'
#remove mentions but keep hashtags with their sign
#token_pattern = r'(?u)(?<![@])#?\b\w\w+\b'
#remove multiple occurrences of a character after 2 times yesss => yess
#re.sub(r"(.)\1+", r"\1\1", s)
self.vectorizer = TfidfVectorizer(tokenizer=self.tokenizer, token_pattern=self.token_pattern, use_idf=self.idf,
norm=self.norm, binary=self.btf, sublinear_tf=self.subtf,
min_df=self.mindf, max_df=self.maxdf, ngram_range=(1, 1), stop_words=self.stops,
vocabulary=self.vocab, encoding=self.encoding, dtype='float32')
logging.info(self.vectorizer)
self.X_train = self.vectorizer.fit_transform(self.df_train.text.values)
self.X_dev = self.vectorizer.transform(self.df_dev.text.values)
self.X_test = self.vectorizer.transform(self.df_test.text.values)
logging.info("training n_samples: %d, n_features: %d" % self.X_train.shape)
logging.info("development n_samples: %d, n_features: %d" % self.X_dev.shape)
logging.info("test n_samples: %d, n_features: %d" % self.X_test.shape)
def loadDataset():
'''Load the feature matrix X and label vector y from df_vec.csv.'''
df = pd.read_csv('df_vec.csv')
# print df.shape
X = np.array(df.iloc[:, 1:])
y = np.array(df.iloc[:, 0])
# print y
# bet_list = list(df.iloc[:, 0])
# dataset = []
# for bet in bet_list:
# s, bet = bet.split(':')
# dataset.append(bet)
# print dataset
# print X
# print y
return X, y
# def transform(dataset, n_features=1000):
# vectorizer = TfidfVectorizer(max_df=0.5, max_features=n_features, min_df=2, use_idf=True)
# X = vectorizer.fit_transform(dataset)
# print X
# # print vectorizer
# return X, vectorizer
def get_binary(self):
return Pipeline([
('tfidf', TfidfVectorizer(stop_words=sw.words('dutch'), norm='l2', use_idf=True)),
('feat_select', SelectPercentile(percentile=10)),
('clf', OneVsRestClassifier(SGDClassifier(alpha=0.0001,
average=False,
class_weight=None,
epsilon=0.1,
eta0=0.0,
fit_intercept=True,
l1_ratio=0.15,
learning_rate='optimal',
loss='log',
n_iter=10,
n_jobs=1,
penalty='l2',
power_t=0.5,
random_state=None,
shuffle=True,
verbose=0,
warm_start=False
)))
])
def get_sgdc(self):
return Pipeline([
('tfidf', TfidfVectorizer(stop_words=sw.words('dutch'), norm='l2', use_idf=True)),
('feat_select', SelectPercentile(percentile=10)),
('clf', SGDClassifier(alpha=0.0001,
average=False,
class_weight=None,
epsilon=0.1,
eta0=0.0,
fit_intercept=True,
l1_ratio=0.15,
learning_rate='optimal',
loss='log',
n_iter=10,
n_jobs=1,
penalty='l2',
power_t=0.5,
random_state=None,
shuffle=True,
verbose=0,
warm_start=False))
])
def get_similarity_scores(verb_token, vectorizer, tf_idf_matrix):
""" Compute the cosine similarity score of a given verb token against the input corpus TF/IDF matrix.
:param str verb_token: Surface form of a verb, e.g., *born*
:param sklearn.feature_extraction.text.TfidfVectorizer vectorizer: Vectorizer
used to transform verbs into vectors
:param tf_idf_matrix: TF/IDF matrix of the input corpus, built with the same vectorizer
:return: cosine similarity score
:rtype: ndarray
"""
verb_token_vector = vectorizer.transform([verb_token])
# Here the linear kernel is the same as the cosine similarity, but faster
# cf. http://scikit-learn.org/stable/modules/metrics.html#cosine-similarity
scores = linear_kernel(verb_token_vector, tf_idf_matrix)
logger.debug("Corpus-wide TF/IDF scores for '%s': %s" % (verb_token, scores))
logger.debug("Average TF/IDF score for '%s': %f" % (verb_token, average(scores)))
return scores
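A hedged, self-contained example of producing the scores: fit a TfidfVectorizer on a small verb corpus, then pass the fitted vectorizer and its matrix to get_similarity_scores(). The corpus is invented, and the module-level names the snippet uses (linear_kernel, logger, average) are assumed to be imported as in the original file:

# Hypothetical usage sketch for get_similarity_scores().
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ["born in 1979", "she was born in Rome", "he died in 2001", "they married in 1995"]
vectorizer = TfidfVectorizer()
tf_idf_matrix = vectorizer.fit_transform(corpus)
scores = get_similarity_scores("born", vectorizer, tf_idf_matrix)
print(scores)   # one cosine-similarity score per corpus document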
def preprocess_simple( docs, stopwords, min_df = 3, min_term_length = 2, ngram_range = (1,1), apply_tfidf = True, apply_norm = True ):
"""
Preprocess a list of text documents stored as strings, where each document has already been tokenized and its tokens are separated by whitespace
"""
token_pattern = re.compile(r"[\s\-]+", re.U)
def custom_tokenizer( s ):
return [x.lower() for x in token_pattern.split(s) if (len(x) >= min_term_length) ]
# Build the Vector Space Model, apply TF-IDF and normalize lines to unit length all in one call
if apply_norm:
norm_function = "l2"
else:
norm_function = None
tfidf = TfidfVectorizer(stop_words=stopwords, lowercase=True, strip_accents="unicode", tokenizer=custom_tokenizer, use_idf=apply_tfidf, norm=norm_function, min_df = min_df, ngram_range = ngram_range)
X = tfidf.fit_transform(docs)
terms = []
# store the vocabulary map
v = tfidf.vocabulary_
for i in range(len(v)):
terms.append("")
for term in v.keys():
terms[ v[term] ] = term
return (X,terms)
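A brief usage sketch for preprocess_simple(); the whitespace/hyphen tokenizer means the inputs are expected to be pre-tokenized strings, and the snippet's module-level imports (re, TfidfVectorizer) are assumed. The documents and stop word list below are invented:

# Hypothetical usage sketch for preprocess_simple().
docs = ["data mining text mining", "text clustering and topic modelling",
        "topic modelling of text data", "mining patterns from data"]
X, terms = preprocess_simple(docs, stopwords=["and", "of", "from"], min_df=2)
print(X.shape)
print(terms)   # column order matches the TF-IDF matrix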
def _vectorize_documents(self,method='tfidf',max_features=100):
stop_words = []
try:
for lexicon_id in self.params['cluster_lexicons']:
lexicon = Lexicon.objects.get(id=int(lexicon_id))
words = Word.objects.filter(lexicon=lexicon)
stop_words+=[word.wrd for word in words]
except KeyError:
pass
if method == 'count':
vectorizer = CountVectorizer(analyzer='word', max_features=max_features, stop_words=stop_words)
if method == 'tfidf':
vectorizer = TfidfVectorizer(analyzer='word', max_features=max_features, stop_words=stop_words)
document_vectors = vectorizer.fit_transform(self.documents)
document_vectors = document_vectors.toarray()
return document_vectors,vectorizer.get_feature_names()
def generateTfIdfVectorizer(data, stop='english', max_df=0.08, min_df=8):
tokenizer = tokenizer_snowball if stop != 'english' else tokenizer_porter
tfidf = TfidfVectorizer(strip_accents=None,
max_df=max_df,
min_df=min_df,
lowercase=True,
stop_words=stop,
sublinear_tf=True,
tokenizer=tokenizer,
analyzer='word',
max_features=16,
preprocessor=preprocessor)
X = tfidf.fit_transform(data)
print('%d Features: %s' %
(len(tfidf.get_feature_names()), tfidf.get_feature_names()))
return X
def gridSearch(data, params, true_k):
tfidf = TfidfVectorizer(strip_accents=None,
lowercase=True,
sublinear_tf=True,
analyzer='word')
lr_tfidf = Pipeline([('vect', tfidf),
('clf', KMeans(init='k-means++',
n_jobs=-1,
random_state=0,
verbose=0))])
gsTfIdf = GridSearchCV(
lr_tfidf, params, n_jobs=1, verbose=1)
gsTfIdf.fit(data)
print()
print("Best score: %0.3f" % gsTfIdf.best_score_)
print("Best parameters set:")
best_parameters = gsTfIdf.best_estimator_.get_params()
for param_name in sorted(params.keys()):
print("\t%s: %r" % (param_name, best_parameters[param_name]))
def get_word_clouds(tweets, users, words_n=50, lang='english'):
default_stopwords = set(nltk.corpus.stopwords.words(lang))
stopwords_file = '../data/stopwords.txt'
custom_stopwords = set(open(stopwords_file, 'r').read().splitlines())
all_stopwords = default_stopwords | custom_stopwords
vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words=list(all_stopwords))
X = vectorizer.fit_transform(tweets)
terms = vectorizer.get_feature_names()
word_cloud_per_person = {}
for doc in range(len(tweets)):
feature_index = X[doc, :].nonzero()[1]
tfidf_scores = zip(feature_index, [X[doc, x] for x in feature_index])
doc_terms = []
for word, score in [(terms[i], score) for (i, score) in tfidf_scores]:
doc_terms.append((word, score))
important_terms = [(word, score) for word, score in sorted(doc_terms, key=lambda x: x[1], reverse=True)][:words_n]
word_cloud_per_person[users[doc]] = important_terms
return word_cloud_per_person
def delegate_create( self, top, bottom, sample_size=1000, source=sfsf_config.EPUB ):
top_sellers, bottom_sellers = top, bottom
if source == sfsf_config.EPUB:
training_data_top = self.sample_epubs( top_sellers, sample_size )
training_data_bottom = self.sample_epubs( bottom_sellers, sample_size )
else:
training_data_top = self.sample_txts( top_sellers, sample_size )
training_data_bottom = self.sample_txts( bottom_sellers, sample_size )
training_samples_top = [ sample for training_data in training_data_top for sample in training_data[1] ]
training_samples_bottom = [ sample for training_data in training_data_bottom for sample in training_data[1] ]
isbns = [ training_data[0] for training_data in training_data_top for sample in training_data[1] ] + [ training_data[0] for training_data in training_data_bottom for sample in training_data[1] ]
y_narr = numpy.array( [1] * len( training_samples_top ) + [0] * len( training_samples_bottom ) )
vect = TfidfVectorizer( tokenizer = MorePunctuationTokenizer() )
x_tdm = vect.fit_transform( training_samples_top + training_samples_bottom )
print( 'Created training data', ':' )
print( 'x shape', ':', x_tdm.shape )
print( 'y shape', ':', y_narr.shape )
# TODO: make a nicer return structure
return { 'x': x_tdm, 'y': y_narr, 'vectorizer': vect, 'isbns': isbns }
def create_model_from_training_data(self):
training_comments=[]
training_ratings=[]
print("Training classifier model..")
for sentidata in self.training_data:
comments = preprocess_text(sentidata.text)
training_comments.append(comments)
training_ratings.append(sentidata.rating)
# discard stopwords, apply stemming, and discard words present in fewer than 3 comments
self.vectorizer = TfidfVectorizer(tokenizer=tokenize_and_stem, sublinear_tf=True, max_df=0.5,
stop_words=mystop_words, min_df=3)
X_train = self.vectorizer.fit_transform(training_comments).toarray()
Y_train = np.array(training_ratings)
#Apply SMOTE to improve ratio of the minority class
smote_model = SMOTE(ratio=0.5, random_state=None, k=None, k_neighbors=15, m=None, m_neighbors=15, out_step=.0001,
kind='regular', svm_estimator=None, n_jobs=1)
X_resampled, Y_resampled=smote_model.fit_sample(X_train, Y_train)
model=self.get_classifier()
model.fit(X_resampled, Y_resampled)
return model
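The SMOTE constructor above follows an older imbalanced-learn API (ratio, k, m, out_step, kind). With current imbalanced-learn releases, an equivalent resampling step might look roughly like this; a sketch on synthetic data, not the project's code:

# Hypothetical sketch using the current imbalanced-learn API.
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=400, weights=[0.9, 0.1], random_state=0)
smote = SMOTE(sampling_strategy=0.5, k_neighbors=15, random_state=0)
X_res, y_res = smote.fit_resample(X, y)
print(np.bincount(y), "->", np.bincount(y_res))   # class counts before and after resampling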
def create_pipeline(estimator, reduction=False):
steps = [
('normalize', TextNormalizer()),
('vectorize', TfidfVectorizer(
tokenizer=identity, preprocessor=None, lowercase=False
))
]
if reduction:
steps.append((
'reduction', TruncatedSVD(n_components=10000)
))
# Add the estimator
steps.append(('classifier', estimator))
return Pipeline(steps)
def construct_tf_idf_matrix(data, store=False):
print ("TF-IDF Normalized Matrix Construction...")
vectorizer = TfidfVectorizer(stop_words='english')
print(data)
training_data = vectorizer.fit_transform(data)
print ("Done Constructing Matrix")
print(training_data.toarray())
if store:
print ("Pickling Trained Transformer...")
pickle.dump(vectorizer, open(path_config.TRANSFORMER_PICKLING_FILE, 'wb'))
print ("Pickling Done.")
return training_data
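At prediction time, the pickled vectorizer can be reloaded and used to transform unseen documents; a hedged sketch, where path_config.TRANSFORMER_PICKLING_FILE is the project's own constant referenced in the snippet above:

# Hypothetical inference-side sketch: reuse the pickled TfidfVectorizer on new text.
import pickle
import path_config  # project-specific config module (assumed, as in the snippet above)

with open(path_config.TRANSFORMER_PICKLING_FILE, 'rb') as f:
    vectorizer = pickle.load(f)
new_vectors = vectorizer.transform(["an unseen document to score"])
print(new_vectors.shape)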
def rf_categorize(email):
# get training corpus
emails = []
db = utils.get_local_db()
for collection in db.collection_names():
for record in db.get_collection(collection).find():
emails.append([collection] + [record['Text']])
# vectorize corpus
labels = [row[0] for row in emails]
data = [row[1] for row in emails]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data)
X = X.toarray()
# vectorize input
email_vector = vectorizer.transform([email])
# create random forest and return prediction
forest = RandomForestClassifier(n_estimators = int(sqrt(len(X[0])))+1)
forest.fit(X, labels)
return forest.predict(email_vector)[0]
Source: 04_sent.py, from project Building-Machine-Learning-Systems-With-Python-Second-Edition (author: PacktPublishing)
def create_union_model(params=None):
def preprocessor(tweet):
tweet = tweet.lower()
for k in emo_repl_order:
tweet = tweet.replace(k, emo_repl[k])
for r, repl in re_repl.items():
tweet = re.sub(r, repl, tweet)
return tweet.replace("-", " ").replace("_", " ")
tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor,
analyzer="word")
ling_stats = LinguisticVectorizer()
all_features = FeatureUnion(
[('ling', ling_stats), ('tfidf', tfidf_ngrams)])
#all_features = FeatureUnion([('tfidf', tfidf_ngrams)])
#all_features = FeatureUnion([('ling', ling_stats)])
clf = MultinomialNB()
pipeline = Pipeline([('all', all_features), ('clf', clf)])
if params:
pipeline.set_params(**params)
return pipeline
Source: 03_clean.py, from project Building-Machine-Learning-Systems-With-Python-Second-Edition (author: PacktPublishing)
def create_ngram_model(params=None):
def preprocessor(tweet):
global emoticons_replaced
tweet = tweet.lower()
for k in emo_repl_order:
tweet = tweet.replace(k, emo_repl[k])
for r, repl in re_repl.items():
tweet = re.sub(r, repl, tweet)
return tweet
tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor,
analyzer="word")
clf = MultinomialNB()
pipeline = Pipeline([('tfidf', tfidf_ngrams), ('clf', clf)])
if params:
pipeline.set_params(**params)
return pipeline
Source: AIserver.py, from project Using-machine-learning-to-detect-malicious-URLs (author: faizann24)
def TL():
allurls = './data/data.csv' #path to our all urls file
allurlscsv = pd.read_csv(allurls,',',error_bad_lines=False) #reading file
allurlsdata = pd.DataFrame(allurlscsv) #converting to a dataframe
allurlsdata = np.array(allurlsdata) #converting it into an array
random.shuffle(allurlsdata) #shuffling
y = [d[1] for d in allurlsdata] #all labels
corpus = [d[0] for d in allurlsdata] #all urls corresponding to a label (either good or bad)
vectorizer = TfidfVectorizer(tokenizer=getTokens) #get a vector for each url but use our customized tokenizer
X = vectorizer.fit_transform(corpus) #get the X vector
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) #split into training and testing set 80/20 ratio
lgs = LogisticRegression() #using logistic regression
lgs.fit(X_train, y_train)
print(lgs.score(X_test, y_test)) #print the score. It comes out to be 98%
return vectorizer, lgs
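Once TL() returns, the fitted vectorizer and model can classify new URLs; a minimal sketch, assuming ./data/data.csv and the getTokens tokenizer exist as in the snippet, with the example URLs below invented for illustration:

# Hypothetical usage sketch for the objects returned by TL().
vectorizer, lgs = TL()
new_urls = ['wikipedia.org/wiki/Machine_learning', 'example-login.verify-account.ru/update.php']
X_new = vectorizer.transform(new_urls)
print(lgs.predict(X_new))   # predicted labels; label names depend on the dataset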
def process(self, df, x_name, y_name=None, ngrams=2, max_features=35000, method='counts', binary=True, sparse=False):
#choosing the particular flavor of vectorizer
if method == 'counts':
vectorizer = CountVectorizer(max_features=max_features, ngram_range=(1, ngrams), decode_error='replace', binary=binary)
elif method == 'tfidf':
vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=(1, ngrams), decode_error='replace')
#fitting the vectorizer and converting the counts to an array
full_fit = vectorizer.fit_transform(df[x_name])
full_counts = full_fit.toarray()
self.vocabulary_ = vectorizer.vocabulary_
#passing the attributes up to the class instance
self.data = df
if sparse:
full_counts = csr_matrix(full_counts)
self.X = full_counts
if y_name is not None:
self.y = np.array(df[y_name])
return
#splits the data into training and test sets; either called from process()
#or on its own when your text is already vectorized and divided into x and y
def bag_of_words(messages, model=None, weighting=''):
# TODO: Add stemming or baseform here
messages, stemmings2baseform = texttools.stemming_messages_snowball(messages)
# Create new model for extrating text features if None is given
if model is None:
if weighting == 'tfidf':
model = TfidfVectorizer()
else:
model = CountVectorizer()
model.fit(messages)
# Extract features
x = model.transform(messages)
return x
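The same fit-once, transform-later pattern can be shown standalone: fit the vectorizer on training messages, then reuse the fitted model (passed back in as the model argument) so new messages land in the same feature space. A sketch with invented data that bypasses the project's texttools stemming step:

# Standalone sketch of fitting once and reusing the fitted model for new text.
from sklearn.feature_extraction.text import TfidfVectorizer

train_messages = ["server is down again", "password reset request", "down for maintenance"]
model = TfidfVectorizer()
model.fit(train_messages)
x_train = model.transform(train_messages)
x_new = model.transform(["is the server down"])   # same vocabulary and weighting as training
print(x_train.shape, x_new.shape)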