def __init__(self, num_topics=100, min_word_count=20,
             top_most_common_words=10, min_doc_length=40,
             max_doc_length=1000, random_state=None):
    """Store topic-model hyper-parameters and set up NLP helpers.

    Parameters
    ----------
    num_topics : int, number of topics for the topic model
    min_word_count : int, minimum corpus frequency for a word to be kept
    top_most_common_words : int, number of most frequent words to treat specially
    min_doc_length : int, minimum document length in tokens
    max_doc_length : int, maximum document length in tokens; must exceed
        ``min_doc_length``
    random_state : seed for reproducibility (None for nondeterministic)
    """
    self.num_topics = num_topics
    self.min_word_count = min_word_count
    self.top_most_common_words = top_most_common_words
    # Guard against an empty/inverted document-length window.
    assert max_doc_length > min_doc_length, \
        "max_doc_length must be greater than min_doc_length"
    self.min_doc_length = min_doc_length
    self.max_doc_length = max_doc_length
    self.random_state = random_state
    # Natural-language-processing helpers: stop-word list and a fresh
    # (untrained) gensim Phrases model for bigram detection.
    self.stop_words = self.getEnglishStopWords()
    self.bigramizer = Phrases()
# Python Phrases() class example source code (translated scraped section heading)
def add_phrases(self, corpus):
    '''
    Learn successively longer multiword phrases from ``corpus``.

    Parameters
    ----------
    corpus: Corpus for phrase augmentation

    Returns
    -------
    New ParsedCorpus containing unigrams in corpus and new phrases
    '''
    assert isinstance(corpus, ParsedCorpus)
    # First pass: learn bigrams directly from the raw sentences.
    first_pass = Phrases(CorpusAdapterForGensim.get_sentences(corpus), delimiter=' ')
    self.phrases = [first_pass]
    # Each later pass re-reads the corpus through the previous Phrases
    # model, so phrases grow one token longer per iteration.
    for _ in range(1, self.max_tokens_per_phrase):
        previous = self.phrases[-1]
        next_pass = Phrases(previous[CorpusAdapterForGensim.get_sentences(corpus)])
        self.phrases.append(next_pass)
    return self
def load_save_word2vec_model(line_words, model_filename):
    """Load a cached word2vec model from disk, or train and save a new one.

    Parameters
    ----------
    line_words : iterable of token lists (one list per line/sentence) used
        to train the model when no cached model exists
    model_filename : str, path of the persisted gensim Word2Vec model

    Returns
    -------
    gensim.models.Word2Vec : the loaded or freshly trained model
    """
    # Training hyper-parameters. (Original comments were garbled mojibake
    # and have been rewritten in English.)
    feature_size = 500
    content_window = 5
    freq_min_count = 3
    # threads_num = 4
    # Number of negative samples; the garbled original note contrasted
    # hierarchical softmax with negative sampling.
    negative = 3
    # Renamed from `iter`, which shadowed the builtin; still passed to
    # Word2Vec via its `iter` keyword.
    epochs = 20
    print("word2vec...")
    tic = time.time()
    if os.path.isfile(model_filename):
        # Reuse the cached model instead of retraining.
        model = models.Word2Vec.load(model_filename)
        print(model.vocab)
        print("Loaded word2vec model")
    else:
        # Merge frequent bigrams into single tokens before training.
        bigram_transformer = models.Phrases(line_words)
        model = models.Word2Vec(bigram_transformer[line_words], size=feature_size,
                                window=content_window, iter=epochs,
                                min_count=freq_min_count, negative=negative,
                                workers=multiprocessing.cpu_count())
        toc = time.time()
        print("Word2vec completed! Elapsed time is %s." % (toc-tic))
        model.save(model_filename)
        # model.save_word2vec_format(save_model2, binary=False)
        print("Word2vec Saved!")
    return model
def train_model(in_file_name, out_file_name, use_plain_word2vec=False, size=100, phrases_n_gram=1, threads=4):
    """Train a word-embedding model from a text file and save it.

    Two back ends are supported: the plain `word2vec` package, or gensim's
    Word2Vec, optionally chaining Phrases transformers to merge n-grams.
    """
    options = {'size': size}
    if use_plain_word2vec:
        if phrases_n_gram > 1:
            # Pre-compute phrases into a sibling file and train on that.
            phrases_file_name = '{}.phrases'.format(in_file_name)
            word2vec.word2phrase(in_file_name, phrases_file_name, verbose=True)
            in_file_name = phrases_file_name
        if threads:
            options['threads'] = threads
        # noinspection PyCallingNonCallable
        word2vec.word2vec(in_file_name, out_file_name, verbose=True, **options)
    else:
        sentences = LineSentence(in_file_name)
        # Each pass merges one additional level of n-grams.
        for _ in range(phrases_n_gram - 1):
            sentences = Phrases(sentences)[sentences]
        if threads:
            options['workers'] = threads
        Word2Vec(sentences, **options).save(out_file_name)
def __init__(self, phrases, gram_size):
    '''
    Parameters
    ----------
    phrases : gensim.models.Phrases
        Trained phrase model. NOTE(review): an earlier docstring claimed
        ``list[gensim.models.Phrases]``, but the validation below requires
        a single Phrases instance — confirm against callers.
    gram_size : int, maximum number of words per phrase
    '''
    # Fixes: removed leftover debug print('xxx') and two no-op
    # self-assignments (`phrases = phrases`, `gram_size = gram_size`);
    # isinstance (rather than `type(x) ==`) also accepts Phrases
    # subclasses, which is backward-compatible.
    assert isinstance(phrases, Phrases), \
        'phrases must be a gensim Phrases instance'
    self.gram_size = gram_size
    self.phrases = phrases
def _scan_and_build_vocab(self):
    """Run the corpus through a bigram transformer, then scan and build
    the model's vocabulary from the transformed sentences."""
    from gensim.models import Phrases
    get_sents = CorpusAdapterForGensim.get_sentences
    transformer = Phrases(get_sents(self.corpus))
    self.model.scan_vocab(transformer[get_sents(self.corpus)])
    self.model.build_vocab(transformer[get_sents(self.corpus)])
def trainPhrasesModel(tweets):
    """
    Train phrases model, experimental, not used
    :param tweets: list of tokenised tweets
    :return:
    """
    print("Learning multiword expressions")
    phrase_model = Phrases(tweets)
    # Persist the learned bigram model for later inspection/reuse.
    phrase_model.save("../out/phrase_all.model")
    print("Sanity checking multiword expressions")
    test = "i like donald trump , go hillary clinton , i like jesus , jesus , legalisation abortion "
    print(phrase_model[test.split(" ")])
    return phrase_model[tweets]
def __init__(self, lang, tokenizer=None, load=True):
    """Set up per-language NLP state, reusing persisted phrase and
    dictionary models from the language's data directory when ``load`` is
    true and the files exist; otherwise start with empty models."""
    self.lang = lang
    self.tokenizer = tokenizer or Tokenizer(lang)
    dirname = join(nlp_data, lang)
    dict_fname = join(dirname, DICTIONARY_FNAME)
    phrase_fname = join(dirname, PHRASES_FNAME)
    self.phrases = (gmodels.Phrases.load(phrase_fname)
                    if load and exists(phrase_fname)
                    else gmodels.Phrases())
    self.dictionary = (corpora.Dictionary.load(dict_fname)
                       if load and exists(dict_fname)
                       else corpora.Dictionary())