def __init__(self, corpus, relationtype, modelname="scikit_classifier"):
super(ScikitRE, self).__init__()
self.modelname = relationtype + "_" + modelname
self.relationtype = relationtype
self.pairtype = relationtype
self.corpus = corpus
self.pairs = []
self.features = []
self.labels = []
self.pred = []
self.clusters = word2vec.load_clusters("corpora/Thaliana/documents-processed-clusters.txt")
self.posfmeasure = make_scorer(f1_score, average='binary', pos_label=True)
self.generate_data(corpus, modelname, relationtype)
self.text_clf = Pipeline([('vect', CountVectorizer(analyzer='char_wb', ngram_range=(3,20), min_df=0.0, max_df=0.7)),
#('vect', CountVectorizer(ngram_range=(1,3), binary=False, max_features=None)),
#('tfidf', TfidfTransformer(use_idf=True, norm="l2")),
#('clf', SGDClassifier(loss='hinge', penalty='l1', alpha=0.0001, n_iter=5, random_state=42)),
#('clf', SGDClassifier())
#('clf', svm.NuSVC(nu=0.01 ))
#('clf', RandomForestClassifier(class_weight={False:1, True:2}, n_jobs=-1))
('clf', MultinomialNB(alpha=0.01, fit_prior=False))
#('clf', DummyClassifier(strategy="constant", constant=True))
])
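A minimal standalone sketch of the same char_wb n-gram + MultinomialNB pipeline used in text_clf above, fit on toy data (the sentences and labels are illustrative only, not part of the original corpus):
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# hypothetical relation-candidate texts and binary labels
train_texts = ["geneA interacts with geneB", "geneC is unrelated to geneD"]
train_labels = [True, False]

clf = Pipeline([
    ('vect', CountVectorizer(analyzer='char_wb', ngram_range=(3, 20), max_df=0.7)),
    ('clf', MultinomialNB(alpha=0.01, fit_prior=False)),
])
clf.fit(train_texts, train_labels)
print(clf.predict(["geneA binds geneB"]))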
Python CountVectorizer() usage examples (source code)
def tfidf_feature(xtrain, xtest, stopwords_path):
"""
tf-idf feature
"""
xtrain = [" ".join(word) for word in xtrain]
xtest = [" ".join(word) for word in xtest]
stopwords = codecs.open(stopwords_path, 'r', encoding='utf-8').readlines()
stopwords = [word.strip("\n") for word in stopwords]
vectorizer_train = CountVectorizer(analyzer='word', stop_words=stopwords,min_df=5)
count_train = vectorizer_train.fit_transform(xtrain)
vectorizer_test = CountVectorizer(vocabulary=vectorizer_train.vocabulary_)
count_test = vectorizer_test.fit_transform(xtest)
transformer = TfidfTransformer()
tfidf_train = transformer.fit_transform(count_train)
# reuse the idf weights learned on the training counts for the test counts
tfidf_test = transformer.transform(count_test)
return tfidf_train.toarray(),tfidf_test.toarray()
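A brief usage sketch for tfidf_feature; the stopword file written here is a hypothetical placeholder and the toy token lists are sized so that a few terms survive the min_df=5 cutoff inside the function:
import codecs

# write a tiny hypothetical stopword file so the call below is runnable
with codecs.open("stopwords.txt", "w", encoding="utf-8") as f:
    f.write("the\nand\n")

# toy token lists; "movie" and "great" occur in at least 5 documents
xtrain = [["great", "movie"], ["bad", "movie"], ["great", "film"], ["boring", "movie"],
          ["great", "acting"], ["bad", "film"], ["great", "movie"], ["movie", "great"]]
xtest = [["great", "movie"]]

train_vecs, test_vecs = tfidf_feature(xtrain, xtest, "stopwords.txt")
print(train_vecs.shape, test_vecs.shape)   # (8, 2) and (1, 2)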
def __init__(self, ngram_range=(1, 1), analyzer='word', count=True,
n_features=200):
"""Initializes the classifier.
Args:
ngram_range (tuple): Pair of ints specifying the range of ngrams.
analyzer (string): Determines what type of analyzer to be used.
Setting it to 'word' will consider each word as a unit of language
and 'char' will consider each character as a unit of language.
count (boolean): Determines if features are counts of n-grams
versus a binary value encoding if the n-gram is present or not.
n_features (int): Maximum number of features used.
"""
# checking what type of vectorizer to create
if count:
self.vectorizer = CountVectorizer(analyzer=analyzer,
ngram_range=ngram_range,
max_features=n_features)
else:
self.vectorizer = HashingVectorizer(analyzer=analyzer,
ngram_range=ngram_range,
n_features=n_features)
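The enclosing class is not shown here; a standalone sketch of the same count-versus-hashing choice on toy documents, illustrating the trade-off the constructor encodes:
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer

docs = ["spam spam eggs", "eggs and ham"]
counting = CountVectorizer(analyzer='word', ngram_range=(1, 2), max_features=200)
hashing = HashingVectorizer(analyzer='word', ngram_range=(1, 2), n_features=200)

X_counts = counting.fit_transform(docs)   # learns a vocabulary, stores raw counts
X_hashed = hashing.transform(docs)        # stateless: hashes n-grams into 200 buckets
print(X_counts.shape, X_hashed.shape)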
def evaluate(cat, fold, txt_train, txt_test, y_train, y_test):
fe = CountVectorizer(
preprocessor=normalize,
tokenizer=micro_tokenize,
binary=True,
)
predictor = NBSVM_predictor(
kernel=conf.SVM_KERNEL,
class_weight=conf.SVM_CLWEIGHT,
C=conf.SVM_C,
)
fe.fit(txt_train)
X = fe.transform(txt_train)
predictor.fit(X, y_train)
X_test = fe.transform(txt_test)
y_pred = predictor.predict(X_test)
return y_pred
def evaluate(cat, fold, txt_train, txt_test, y_train, y_test):
fe = CountVectorizer(
preprocessor=normalize,
tokenizer=micro_tokenize,
binary=True,
)
predictor = SVC(
kernel=conf.SVM_KERNEL,
class_weight=conf.SVM_CLWEIGHT,
C=conf.SVM_C,
random_state=conf.SEED,
)
fe.fit(txt_train)
X = fe.transform(txt_train)
predictor.fit(X, y_train)
X_test = fe.transform(txt_test)
y_pred = predictor.predict(X_test)
return y_pred
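A hedged sketch of the driver loop such an evaluate() function is typically called from; `folds` and `cat` are hypothetical names for the cross-validation splits and category under evaluation, not part of the original code:
from sklearn.metrics import f1_score

scores = []
# folds is assumed to yield (txt_train, txt_test, y_train, y_test) tuples
for fold, (txt_train, txt_test, y_train, y_test) in enumerate(folds):
    y_pred = evaluate(cat, fold, txt_train, txt_test, y_train, y_test)
    scores.append(f1_score(y_test, y_pred))
print(sum(scores) / len(scores))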
def compute_VwS(self,s):
""" Compute V(w,S) as defined by Cohen et al.'s IJCAI03 paper """
# Get term-frequency vectors and vocab list for string
cv = CountVectorizer(min_df = 0.0, token_pattern=u'(?u)\\b\\w+\\b')
tf = cv.fit_transform([s]); tf = tf.tocsr()
vocab = cv.vocabulary_
# Compute V(w,S) for string
vprime_ws = dict()
vprime_ws_norm = 0
for w in vocab:
if w in self.CORPUS_VOCAB:
vprime_ws[w] = math.log(tf[0,vocab[w]]+1)*self.LOG_IDF[self.CORPUS_VOCAB[w]]
else:
vprime_ws[w] = math.log(tf[0,vocab[w]]+1)*self.OOV_IDF_VAL # if not in the corpus vocab, default to OOV_IDF_VAL
vprime_ws_norm += vprime_ws[w]**2
vprime_ws_norm = math.sqrt(vprime_ws_norm)
return (vocab,vprime_ws,vprime_ws_norm)
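Cohen et al.'s TF-IDF similarity combines two such vectors. A hedged sketch of a companion method built purely from compute_VwS outputs; the full SoftTFIDF additionally weights token pairs by a secondary string distance (e.g. Jaro-Winkler), which is omitted here:
def tfidf_cosine(self, s, t):
    """Cosine of the V(w,S) vectors of two strings (TF-IDF component only)."""
    _, v_s, norm_s = self.compute_VwS(s)
    _, v_t, norm_t = self.compute_VwS(t)
    shared = set(v_s) & set(v_t)
    dot = sum(v_s[w] * v_t[w] for w in shared)
    return dot / (norm_s * norm_t) if norm_s and norm_t else 0.0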
def bow_to_npy(vocabulary_fname, bow_fname, npy_fname):
''' Vectorize bag-of-words dump and save in NumPy file
PARAMETERS
-----------
vocabulary_fname: str or Path
Vocabulary text file name, with one word on each line.
bow_fname: str or Path
Bag-of-words .txt.gz file name. When uncompressed,
each line represents a document with only lower-case words
separated by space.
npy_fname: str or Path
NumPy .npy file name to write the word count vectors into.
'''
with Path(vocabulary_fname).open('r') as vocabulary_file:
vocabulary = [line.strip() for line in vocabulary_file]
vectorizer = CountVectorizer(vocabulary=vocabulary)
with gzip.open(bow_fname, 'rt') as bow_file:
word_counts = vectorizer.transform(bow_file)
np.save(npy_fname, word_counts)
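An illustrative call with placeholder file names; note that word_counts is a scipy sparse matrix, so np.save stores it as a pickled 0-d object array and np.load needs allow_pickle=True to get it back:
# hypothetical file names
bow_to_npy('vocabulary.txt', 'bow.txt.gz', 'word_counts.npy')
word_counts = np.load('word_counts.npy', allow_pickle=True).item()
print(word_counts.shape)
scipy.sparse.save_npz would be the more natural on-disk format for sparse count matrices, if changing the output format is an option.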
def test_read_files(self):
docs = ['Lorem ipsum', 'Lorem Lorem ipsum Dolor sit AMET', 'consectetur adipisici elit']
thesaurus = {'13542-1': {'prefLabel': ['ipsum'], 'broader': ['0b'], 'related': ['0r'],
'narrower': ['0n'], 'altLabel': []},
'13542-4': {'prefLabel': ['dolor'], 'broader': ['1b'], 'related': ['1r'],
'narrower': ['1n'], 'altLabel': ['amet']},
}
vocabulary = {'13542-1': 1, '13542-4': 0}
fnames = []
for doc in docs:
file = NamedTemporaryFile(mode='w', delete=False)
fnames.append(file.name)
print(doc, file=file)
file.close()  # flush to disk so the vectorizer can read the file by name
cf = ConceptAnalyzer(thesaurus, input='filename')
counter = CountVectorizer(analyzer=cf.analyze, vocabulary=vocabulary, input='filename')
res = counter.fit_transform(fnames).todense()
np.testing.assert_array_almost_equal(res, [[0, 1], [2, 1], [0, 0]])
def get_topic_distributions(examples, vectorizer, lda_model):
"""
Retrieve the topic distributions of a collection of documents.
:param examples: a list of tokenised documents
:param vectorizer: the CountVectorizer used for transforming the documents
:param lda_model: the trained LDA model
:return: an array of shape (num_examples, num_topics) containing the topic
distribution of each example
"""
vectorized_corpus = vectorizer.transform(examples)
gensim_corpus = gensim.matutils.Sparse2Corpus(vectorized_corpus,
documents_columns=False)
topic_representations = []
for doc in gensim_corpus:
topic_representations.append(
[topic_prob for (topic_id, topic_prob) in
lda_model.get_document_topics(doc, minimum_probability=0.)])
return np.array(topic_representations)
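A hedged sketch of how the vectorizer and LDA model that get_topic_distributions expects could be prepared, using toy documents and two topics:
import gensim
from sklearn.feature_extraction.text import CountVectorizer

docs = ["the cat sat on the mat", "stocks fell sharply on monday"]
vectorizer = CountVectorizer().fit(docs)
corpus = gensim.matutils.Sparse2Corpus(vectorizer.transform(docs), documents_columns=False)
id2word = {idx: word for word, idx in vectorizer.vocabulary_.items()}
lda_model = gensim.models.LdaModel(corpus, num_topics=2, id2word=id2word)

print(get_topic_distributions(docs, vectorizer, lda_model).shape)  # (2, 2)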
# PRE-TRAINED WORD EMBEDDINGS METHODS
def get_word_counts(input_str, limit = 100):
input_str = PreprocessManager.remove_non_ascii(input_str)
wordnet_lemmatizer = WordNetLemmatizer()
snowball_stemmer = EnglishStemmer()
tokenized_text = CountVectorizer().build_tokenizer()(input_str.lower())
tokenized_text = [word for word in tokenized_text if len(word) > 1] # Filter some small words
#tokenized_text = [word for word in tokenized_text if not word.isnumeric()]
filtered_words = [word for word in tokenized_text if word not in stopwords.words('english')]
stemmed_list = [wordnet_lemmatizer.lemmatize(w) for w in filtered_words]
# Calculate frequency distribution
frequency_dist = nltk.FreqDist(stemmed_list)
# Collect the top `limit` most frequent words
result = dict()
for word, frequency in frequency_dist.most_common(limit):
# print(u'{};{}'.format(word, frequency))
result[word] = frequency
return result
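An illustrative call; it assumes the NLTK 'stopwords' and 'wordnet' corpora are downloaded and that PreprocessManager is importable from the surrounding project:
print(get_word_counts("The cats are sitting on the mat with other cats", limit=5))
# e.g. {'cat': 2, 'sitting': 1, 'mat': 1}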
# The next function simply tokenizes the input and returns the words.
def getModels(self):
with open(self.data_path + '/categories.pkl', 'rb') as f:
categories = cPickle.load(f)
with open(self.data_path + '/category_map.pkl', 'rb') as f:
category_map = cPickle.load(f)
with open(self.data_path + '/article_classifier_model.pkl', 'rb') as f:
clf = cPickle.load(f)
with open(self.data_path + '/count_vect.pkl', 'rb') as f:
count_vect = cPickle.load(f)
with open(self.data_path + '/tfidf_transformer.pkl', 'rb') as f:
tfidf_transformer = cPickle.load(f)
with open(self.data_path + '/tree.pkl', 'rb') as f:
tree = cPickle.load(f)
return categories, category_map, clf, count_vect, tfidf_transformer, tree
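A hedged sketch of how the unpickled objects are typically chained elsewhere in the class to label a new article; how categories/category_map are used for display is not shown in the original, so the final print just emits the raw prediction:
categories, category_map, clf, count_vect, tfidf_transformer, tree = self.getModels()
counts = count_vect.transform(["text of a new article"])   # bag-of-words counts
tfidf = tfidf_transformer.transform(counts)                 # tf-idf weighting
print(clf.predict(tfidf)[0])                                # predicted label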
def get_topic_idf(self, sentences):
vectorizer = CountVectorizer()
sent_word_matrix = vectorizer.fit_transform(sentences)
transformer = TfidfTransformer(norm=None, sublinear_tf=False, smooth_idf=False)
tfidf = transformer.fit_transform(sent_word_matrix)
tfidf = tfidf.toarray()
centroid_vector = tfidf.sum(0)
centroid_vector = np.divide(centroid_vector, centroid_vector.max())
# print(centroid_vector.max())
feature_names = vectorizer.get_feature_names()
word_list = []
for i in range(centroid_vector.shape[0]):
if centroid_vector[i] > self.topic_threshold:
# print(feature_names[i], centroid_vector[i])
word_list.append(feature_names[i])
return word_list
def build_analyzer(self):
analyzer = super(TfidfVectorizer, self).build_analyzer()
return lambda doc: (stemmer.stem(w) for w in analyzer(doc))
########## Stemmer + CountVectorizer wrapper #############
def build_analyzer(self):
analyzer = super(CountVectorizer, self).build_analyzer()
return lambda doc: (stemmer.stem(w) for w in analyzer(doc))
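The two build_analyzer overrides above only make sense inside vectorizer subclasses. A sketch of the usual wrapper pattern, assuming an NLTK Snowball stemmer for the `stemmer` object:
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer

stemmer = SnowballStemmer('english')

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: (stemmer.stem(w) for w in analyzer(doc))

vec = StemmedCountVectorizer()
vec.fit(["running runners run"])
print(sorted(vec.vocabulary_))   # stems only, e.g. ['run', 'runner']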
########## Default TF-IDF & Count Vectorizers ########
#======== TF-IDF Vectorizer =========#
def train_test():
"""Identify accuracy via training set"""
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)
vect = CountVectorizer()
X_train_dtm = vect.fit_transform(X_train) # learn the vocabulary and build the training document-term matrix
X_test_dtm = vect.transform(X_test)
nb = MultinomialNB()
nb.fit(X_train_dtm, y_train)
y_pred_class = nb.predict(X_test_dtm) # make class predictions for X_test_dtm
# w = list(X_test)
return metrics.accuracy_score(y_test, y_pred_class)
# print(train_test())
def __init__(self):
self.clf = LinearSVC()
self.scores = []
self.vectorizer = CountVectorizer(token_pattern=r'[A-Za-z]+', stop_words=english_stops,
ngram_range=(1, 1))
def create_speech(self):
self.speech = dict.fromkeys(self.archives,[])
#blacklist=[] # ids to be ignored, not implemented yet
self.vectorizer = dict.fromkeys(self.archives,[])
self.mat = dict.fromkeys(self.archives,[])
for key in self.speech:
self.speech[key]=[[],[]] # messages / ids / (maybe timestamps?)
self.vectorizer[key]=CountVectorizer(min_df=1)
if key >=0:
continue # why create dictionaries for private messages right now...
logfile="{}.gz".format(os.path.join(self.logpath,str(key)))
try:
ziplines=gzip.open(logfile).read().decode("utf-8").strip("\r\n").split("\n")[-15000:]
except IOError:
print("{} not found".format(logfile))
continue
prev_id = -1
for msg_line in ziplines:
msg = Msg(json.loads(msg_line))
text=msg.get_text()
chat_id=msg.get_chat_id()
if (key != chat_id):
input("Error in your logfile (key {} / chat {})!".format(key,chat_id))
sent_id=msg.get_sent_id()
# skip commands, edited messages, blacklisted text and name mentions; sadly, @like will still come through
if (text and text[0] not in ["/", "!"] and msg.get_edit_date() == 0
and not self.is_blacklisted(text) and not self.find_name(text)
and chat_id and sent_id):
if sent_id == prev_id:
self.speech[key][0][-1]+="\n{}".format(text)
else:
self.speech[key][0].append(text)
self.speech[key][1].append(sent_id)
prev_id = sent_id
if self.speech[key][0]:
self.mat[key]=self.vectorizer[key].fit_transform(self.speech[key][0])
def compute_tf(data, stopwords_list, language, use_lemmer=True, min_df=2, max_df=0.8):
"""
Compute the tf matrix for the provided data
:param language: 'en' or 'it'
:param data:
:param stopwords_list:
:param use_lemmer:
:param min_df:
:param max_df:
:return:
"""
lemmer_tokenizer = None
if use_lemmer:
if language == 'it':
lemmer_tokenizer = LemNormalizeIt
else:
lemmer_tokenizer = LemNormalize
min_df = min_df if len(data) > min_df else 1
max_df = max_df if max_df * len(data) >= min_df else 1.0
# tf
tf_vectorizer = CountVectorizer(tokenizer=lemmer_tokenizer,
max_df=max_df, min_df=min_df,
max_features=None,
stop_words=stopwords_list,
token_pattern="[a-zA-Z]{3,}")
try:
tf = tf_vectorizer.fit_transform(data)
tf_features_names = tf_vectorizer.get_feature_names()
except ValueError:
# fit_transform raises ValueError when pruning leaves an empty vocabulary
logging.warning('The computed tf matrix is empty. Check stopwords.')
tf = []
tf_features_names = []
return tf, tf_features_names
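An illustrative call; LemNormalizeIt/LemNormalize are project-specific tokenizers, so use_lemmer=False falls back to CountVectorizer's default tokenization here, and a scikit-learn version that still provides get_feature_names() is assumed:
docs = ["the quick brown fox", "the lazy dog", "quick brown dogs"]
tf, names = compute_tf(docs, stopwords_list=['the'], language='en',
                       use_lemmer=False, min_df=1, max_df=1.0)
print(tf.shape, names)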
textprocess.py — from project scik-learn-learn-Chinese-text-classider (author: chapzq77)
def voc_count_bag(self):
if (self.wordbag_path == "" or self.vocabulary_count_bag_name == "" or self.stopword_path ==""):
print "wordbag_path(????????) or vocabulary_count_bag_name(?????????) or stopword_path(??????) can not be empty."
return
file_obj = open(self.wordbag_path+self.trainset_name,'rb')
self.data_set = pickle.load(file_obj)
file_obj.close()
# populate vocabulary_count_bag with the training-set metadata
self.vocabulary_count_bag.target_name = self.data_set.target_name
self.vocabulary_count_bag.label =self.data_set.label
self.vocabulary_count_bag.filenames =self.data_set.filenames
corpus = self.data_set.content
stopword_list = self.getstopword(self.stopword_path)
# count term occurrences over the corpus, excluding stopwords
vectorizer = CountVectorizer(stop_words=stopword_list, max_df=500, min_df=1,max_features=10000)
y = vectorizer.fit_transform(corpus)
self.vocabulary_count_bag.vcm = y
self.vocabulary_count_bag.vcm_sum = y.toarray().sum(axis=0)
self.vocabulary_count_bag.vocabulary = vectorizer.get_feature_names()
if not os.path.exists(self.wordbag_path):
os.makedirs(self.wordbag_path)
file_obj1 = open(self.wordbag_path+self.vocabulary_count_bag_name,'wb')
pickle.dump(self.vocabulary_count_bag,file_obj1)
file_obj1.close()
print "????????vocabulary_count_bag???wordbag_path???????vocabulary_count_bag_name??????"
print "#######################################"
def __init__(self, texts, ids, vocabulary=None, encoding='utf-8'):
"""Definido en la declaracion de la clase.
Attributes:
texts (list of str): Textos a clasificar.
ids (list of str): Identificadores únicos para cada texto (debe
tener la misma longitud que `texts`).
vocabulary (list): Opcional. Vocabulario a tener en cuenta para la
vectorización de los textos. Default: usa todas las palabras
presentes en los textos, salvo los ES_stopwords.txt.
encoding (str): Codificación de los textos en `texts` y en `ids`.
"""
this_dir, this_filename = os.path.split(__file__)
es_stopwords = pd.read_csv(os.path.join(this_dir, 'ES_stopwords.txt'),
header=None, encoding='utf-8')
es_stopwords = list(np.squeeze(es_stopwords.values))
self._check_id_length(ids)
self.vectorizer = CountVectorizer(
input='content', encoding=encoding, decode_error='strict',
strip_accents='ascii', lowercase=True, preprocessor=None,
tokenizer=None, stop_words=es_stopwords, ngram_range=(1, 1),
analyzer='word', max_df=0.8, min_df=1, max_features=None,
vocabulary=vocabulary, binary=False)
self.transformer = TfidfTransformer()
self.ids = None # Keeps an ordered list of text ids.
self.term_mat = None # Matrix of term counts per text.
self.tfidf_mat = None # Matrix of tf-idf term relevance weights.
self.reload_texts(texts, ids)
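reload_texts is defined elsewhere in the class; a hedged sketch of the vectorization step it presumably performs with the objects initialized above (an assumption based on the attribute names, not the original implementation):
def reload_texts(self, texts, ids):
    # hypothetical body: count terms, then weight them with tf-idf
    self._check_id_length(ids)
    self.ids = list(ids)
    self.term_mat = self.vectorizer.fit_transform(texts)
    self.tfidf_mat = self.transformer.fit_transform(self.term_mat)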