def run():
    '''
    Train a Word2Vec model on the prepared corpus, save the model and the
    plain-text vectors, and print the most similar words for a few test terms.
    '''
    reload(sys)                      # Python 2 only: required before setdefaultencoding
    sys.setdefaultencoding('utf8')
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    logger.info("running %s" % ' '.join(sys.argv))

    outp1 = r'wiki_model'
    outp2 = r'vector.txt'
    # `sentences` is assumed to be defined elsewhere as an iterable of
    # tokenized sentences (e.g. a LineSentence over the segmented corpus).
    model = Word2Vec(sentences, size=400, window=5, min_count=5,
                     workers=multiprocessing.cpu_count())
    model.save(outp1)
    model.wv.save_word2vec_format(outp2, binary=False)

    testData = ['??', '??', '??', '??']   # test words (original Chinese characters lost in encoding)
    for i in testData:
        temp = model.most_similar(i)
        for j in temp:
            print '%f %s' % (j[1], j[0])
        print ''
Python Word2Vec() usage examples (source code)
def trainWord2Vector(sentence_count, vector_dimension, train_count):
    lines, model_out, vector_out = "sources/splited_words.txt", "result/word2vec.model", "result/pre_word2vec.vector"
    logging.info("??????")   # original Chinese log message lost in encoding
    sentences = LineSentence(lines)
    # min_count: with min_count=3, words occurring fewer than 3 times would be dropped
    # and not written to word2vec.vector (here min_count=0 keeps every word)
    # workers: number of training threads, set to the CPU core count (gensim default is 3)
    # sg=1 selects the skip-gram training algorithm
    model = Word2Vec(sentences, sg=1, size=vector_dimension, window=8,
                     min_count=0, workers=multiprocessing.cpu_count())
    # run extra training passes over the corpus
    for i in range(train_count):
        model.train(sentences=sentences, total_examples=sentence_count, epochs=model.iter)
    # trim unneeded model memory = use (much) less RAM
    # model.init_sims(replace=True)
    model.save(model_out)
    model.wv.save_word2vec_format(vector_out)
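A minimal call sketch for the helper above; the argument values are illustrative assumptions, and `sentence_count` should match the number of lines in the hard-coded corpus file.

# hypothetical invocation (values are assumptions, not from the original project)
trainWord2Vector(sentence_count=50000, vector_dimension=200, train_count=2)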
def trainWord2Vector(sentence_count, vector_dimension, train_count):
    lines, model_out, vector_out = "com/com/test1/test1sources/splited_words.txt", \
                                   "com/com/test1/test1sources/word2vec.model", \
                                   "com/com/test1/test1sources/word2vec.vector"
    logging.info("??????")   # original Chinese log message lost in encoding
    sentences = LineSentence(lines)
    # min_count: with min_count=3, words occurring fewer than 3 times would be dropped
    # and not written to word2vec.vector (here min_count=0 keeps every word)
    # workers: number of training threads, set to the CPU core count (gensim default is 3)
    model = Word2Vec(sentences, sg=1, size=vector_dimension, window=8,
                     min_count=0, workers=multiprocessing.cpu_count())
    # run extra training passes over the corpus
    for i in range(train_count):
        model.train(sentences=sentences, total_examples=sentence_count, epochs=model.iter)
    # trim unneeded model memory = use (much) less RAM
    # model.init_sims(replace=True)
    model.save(model_out)
    model.wv.save_word2vec_format(vector_out)
def uptrain(corpus,
            model_path=None,
            binary=True,
            lockf=0.0,
            min_count=1,
            size=300,
            **word2vec_params):
    wv = Word2Vec(min_count=min_count, size=size, **word2vec_params)
    print("Building vocabulary...")
    wv.build_vocab(corpus)
    print("Found %d distinct words." % len(wv.index2word))
    if model_path is not None:
        print("Intersecting with", model_path, "...")
        wv.intersect_word2vec_format(model_path, binary=binary, lockf=lockf)
        print("Intersected vectors locked with", lockf)
    total_examples = len(corpus)
    print("Training on %d documents..." % total_examples)
    wv.train(corpus, total_examples=total_examples)
    return wv
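A hedged sketch of how `uptrain` might be used to continue training on top of published vectors; the corpus and the vectors file name below are placeholders, not values from the original project.

toy_corpus = [['graph', 'embedding', 'models'], ['word', 'embedding', 'models']] * 50
# `size` must match the dimensionality of the pre-trained file;
# lockf=1.0 lets the intersected pre-trained vectors keep updating during training
model = uptrain(toy_corpus,
                model_path='pretrained-vectors.bin',   # placeholder path
                binary=True, lockf=1.0, min_count=1, size=300)
print(model.most_similar('embedding'))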
def create(basedir, num_workers=12, size=320, threshold=5):
    """
    Creates a word2vec model using the Gensim word2vec implementation.
    :param basedir: the dir from which to get the documents.
    :param num_workers: the number of workers to use for training word2vec
    :param size: the size of the resulting vectors.
    :param threshold: the frequency threshold.
    :return: the model.
    """
    logging.basicConfig(level=logging.INFO)
    sentences = SentenceIter(root=basedir)
    model = Word2Vec(sentences=sentences,
                     sg=True,
                     size=size,
                     workers=num_workers,
                     min_count=threshold,
                     window=11,
                     negative=15)
    # NOTE: the "{0}-{1}" placeholders below are never filled in this snippet
    model.save_word2vec_format("{0}-{1}.wordvecs", "{0}-{1}.vocab")
    return model
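`SentenceIter` is not shown in this snippet; it is assumed to stream tokenized sentences from files under `root`. A minimal stand-in with that contract could look like this (an assumption, not the project's actual class):

import os

class SentenceIter(object):
    """Yield one whitespace-tokenized sentence per line of every file under `root`."""
    def __init__(self, root):
        self.root = root
    def __iter__(self):
        for name in os.listdir(self.root):
            with open(os.path.join(self.root, name)) as handle:
                for line in handle:
                    yield line.split()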
def train_save(self, list_csv):
    sentences = MySentences(list_csv)
    num_features = 256
    min_word_count = 1
    num_workers = 20
    context = 5
    epoch = 20
    sample = 1e-5
    model = Word2Vec(
        sentences,
        size=num_features,
        min_count=min_word_count,
        workers=num_workers,
        sample=sample,
        window=context,
        iter=epoch,
    )
    #model.save(model_fn)
    return model
def main(positive, negative, topn):
    """Train a word2vec model on tags.txt and return the most similar tags.
    Args:
        positive (list): list of positive tags
        negative (list): list of negative tags
        topn (int): number of top keywords to return
    Returns:
        list: the topn most similar (tag, similarity) pairs
    """
    with open('tags.txt') as f:
        content = f.readlines()
    sentences = [x.split() for x in content]
    model = Word2Vec(sentences, min_count=20)
    return model.most_similar(positive=positive, negative=negative, topn=topn)
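An illustrative call; the tag names are placeholders, and `tags.txt` is expected to hold whitespace-separated tags, one document per line, as the function assumes.

similar_tags = main(positive=['python', 'web'], negative=['java'], topn=5)
for tag, score in similar_tags:
    print('%s\t%.3f' % (tag, score))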
def fit_embeddings(self, documents):
    """
    Train the word embeddings of the classification model, reusing the same parameter values the classifier passes to Gensim ``Word2Vec``.
    Similar to using a pre-trained model.
    :param documents:
    """
    params = self.get_params()
    del params['pre_trained']
    del params['bucket']
    # Word2Vec has no plain softmax output, so fall back to hierarchical softmax
    if params['loss'] == 'softmax':
        params['loss'] = 'hs'
    LabeledWord2Vec.init_loss(LabeledWord2Vec(), params, params['loss'])
    del params['loss']
    w2v = Word2Vec(sentences=documents, **params)
    self._classifier = LabeledWord2Vec.load_from(w2v)
def learn_embeddings(self, output):
    """
    Learn embeddings by optimizing the Skipgram objective using SGD.
    """
    self._simulate_walks()  # simulate random walks
    model = Word2Vec(self._walks, size=self.dimensions, window=self.window_size, min_count=0,
                     workers=self.workers, iter=self.iter, negative=25, sg=1)
    print("defined model using w2v")
    model.wv.save_word2vec_format(output, binary=True)
    print("saved model in word2vec binary format")
    return
def training_word2vec():
    sentences = []
    read_dir_path = os.path.join(defaultPath.PROJECT_DIRECTORY, sogou_classfication.data_path_jieba)
    label_dir_list = os.listdir(read_dir_path)
    for label_dir in label_dir_list:
        label_dir_path = os.path.join(read_dir_path, label_dir)
        label_file_list = os.listdir(label_dir_path)
        for label_file in label_file_list:
            with open(os.path.join(label_dir_path, label_file), 'rb') as reader:
                word_list = reader.read().decode('utf-8').replace('\n', '').replace('\r', '').strip()
                # Word2Vec expects a list of tokens per sentence; the jieba output is
                # assumed to be space-separated, so split each document into tokens here
                sentences.append(word_list.split())
    model_path = os.path.join(defaultPath.PROJECT_DIRECTORY, sogou_classfication.word2Vect_path)
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    model_save_path = os.path.join(model_path, sogou_classfication.model_name)
    model = Word2Vec(sentences, max_vocab_size=None, window=8, size=256, min_count=5, workers=4, iter=20)
    model.save(model_save_path)
def load_save_word2vec_model(line_words, model_filename):
    # training parameters
    feature_size = 500
    content_window = 5
    freq_min_count = 3
    # threads_num = 4
    negative = 3   # negative>0 enables negative sampling (said to suit frequent words); 0 would use hierarchical softmax (said to suit rare words)
    iter = 20
    print("word2vec...")
    tic = time.time()
    if os.path.isfile(model_filename):
        model = models.Word2Vec.load(model_filename)
        print(model.vocab)
        print("Loaded word2vec model")
    else:
        bigram_transformer = models.Phrases(line_words)
        model = models.Word2Vec(bigram_transformer[line_words], size=feature_size, window=content_window,
                                iter=iter, min_count=freq_min_count, negative=negative,
                                workers=multiprocessing.cpu_count())
        toc = time.time()
        print("Word2vec completed! Elapsed time is %s." % (toc - tic))
        model.save(model_filename)
        # model.save_word2vec_format(save_model2, binary=False)
        print("Word2vec Saved!")
    return model
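A small sketch of the bigram step used above: gensim's `Phrases` joins frequently co-occurring tokens with an underscore before the vectors are trained. The corpus and file name below are illustrative placeholders.

from gensim import models

line_words = [['new', 'york', 'is', 'big'], ['new', 'york', 'never', 'sleeps']] * 20
bigram = models.Phrases(line_words, min_count=1, threshold=1)
print(bigram[line_words[0]])   # e.g. ['new_york', 'is', 'big'] once the pair is frequent enough
model = load_save_word2vec_model(line_words, 'word2vec_demo.model')   # placeholder file name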
def learn_embeddings(self, output, output_format='binary'):
    """
    Learn embeddings by optimizing the Skipgram objective using SGD.
    """
    self._simulate_walks()  # simulate random walks
    model = Word2Vec(self._walks, size=self.dimensions, window=self.window_size, min_count=0,
                     workers=self.workers, iter=self.iter, negative=25, sg=1)
    print("defined model using w2v")
    is_binary = output_format != 'text'
    model.wv.save_word2vec_format(output, binary=is_binary)
    actual_format = 'text' if output_format == 'text' else 'binary'
    print("saved model in word2vec %s format" % actual_format)
    return
def main(lang, in_dir, out_loc, negative=5, n_workers=4, window=5, size=128, min_count=10, nr_iter=2):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    model = Word2Vec(
        size=size,
        window=window,
        min_count=min_count,
        workers=n_workers,
        sample=1e-5,
        negative=negative
    )
    nlp = spacy.load(lang, parser=False, tagger=False, entity=False)
    corpus = Corpus(in_dir)
    total_words = 0
    total_sents = 0
    for text_no, text_loc in enumerate(iter_dir(corpus.directory)):
        with io.open(text_loc, 'r', encoding='utf8') as file_:
            text = file_.read()
        total_sents += text.count('\n')
        doc = nlp(text)
        total_words += corpus.count_doc(doc)
        logger.info("PROGRESS: at batch #%i, processed %i words, keeping %i word types",
                    text_no, total_words, len(corpus.strings))
    model.corpus_count = total_sents
    model.raw_vocab = defaultdict(int)
    for orth, freq in corpus.counts:
        if freq >= min_count:
            model.raw_vocab[nlp.vocab.strings[orth]] = freq
    model.scale_vocab()
    model.finalize_vocab()
    model.iter = nr_iter
    model.train(corpus)
    model.save(out_loc)
def gen_embeddings(in_file, out_file, size=100):
    corpus = LineSentence(in_file)
    model = Word2Vec(
        sentences=corpus, size=size, alpha=0.025, window=5, min_count=5,
        max_vocab_size=None, sample=1e-3, seed=1, workers=3, min_alpha=0.0001,
        sg=0, hs=0, negative=5, cbow_mean=1, hashfxn=hash, iter=5, null_word=0,
        trim_rule=None, sorted_vocab=1
    )
    model.save_word2vec_format(out_file, binary=False)
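The text-format vectors written above can be reloaded without retraining; a hedged sketch with placeholder file names:

from gensim.models import KeyedVectors

gen_embeddings('corpus.txt', 'embeddings.txt', size=100)   # placeholder paths
vectors = KeyedVectors.load_word2vec_format('embeddings.txt', binary=False)
print(len(vectors.vocab), 'words of dimension', vectors.vector_size)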
def fit(self, tokens):
    # get most frequent items for plotting:
    tokens = [t.lower() for t in tokens]
    self.mfi = [t for t, _ in Counter(tokens).most_common(self.nb_mfi)]
    self.sentence_iterator = SentenceIterator(tokens=tokens)
    # train embeddings:
    self.w2v_model = Word2Vec(self.sentence_iterator,
                              window=self.window,
                              min_count=self.minimum_count,
                              size=self.size,
                              workers=self.nb_workers,
                              negative=self.nb_negative)
    self.plot_mfi()
    self.most_similar()
    # build an index of the train tokens
    # which occur at least min_count times:
    self.token_idx = {'<UNK>': 0}
    for k, v in Counter(tokens).items():
        if v >= self.minimum_count:
            self.token_idx[k] = len(self.token_idx)
    # create an ordered vocab:
    self.train_token_vocab = [k for k, v in sorted(self.token_idx.items(),
                                                   key=itemgetter(1))]
    self.pretrained_embeddings = self.get_weights(self.train_token_vocab)
    return self
def zhword2vec(ifname, fmodel):
    '''Train the word2vec model.
    more: http://radimrehurek.com/gensim/models/word2vec.html
    '''
    model = Word2Vec(LineSentence(ifname), size=400, window=5,
                     min_count=2, workers=multiprocessing.cpu_count(), negative=5)
    model.save(fmodel)
    # model.save_word2vec_format(fword2vec, binary=False)
def train_model(self, ofmodel, space=' '):
    if self.traincorpusfname == None or not os.path.exists(self.traincorpusfname):
        ifname = self.__pretrain_model(space)
    else:
        ifname = self.traincorpusfname
    self.logger.info('+++++++++++++++Train Model Start+++++++++++++++++\n')
    #
    # Calling the Gensim 3rd-party lib to train the word2vec model
    # more: http://radimrehurek.com/gensim/models/word2vec.html
    model = Word2Vec(LineSentence(ifname), size=400, window=5,
                     min_count=2, workers=multiprocessing.cpu_count(), negative=5)
    self.logger.info('+++++++++++++++Train Model Finished+++++++++++++++++\n')
    model.save(ofmodel)
    return (model, ofmodel)
# if __name__=='__main__':
# if len(sys.argv) < 3:
# print(globals()['__doc__'] %locals())
# sys.exit(1)
# inp, outp =sys.argv[1:3]
# #inp = '../../data/zhwiki-latest-pages-articles.xml.bz2','r'
# #outp = '../../model/word2vec.model'
# wiki = tWikiCorpus(inp, _lemmatize=False, _dictionary={})
# print 'wiki'
# wiki.getTexts(outp, space=' ')
def train_model(self, ofmodel, space=' '):
    if self.traincorpusfname == None or not os.path.exists(self.traincorpusfname):
        ifname = self.pretrain_model(space)
    else:
        ifname = self.traincorpusfname
    self.logger.info('+++++++++++++++Train Model Start+++++++++++++++++\n')
    #
    # Calling the Gensim 3rd-party lib to train the word2vec model
    # more: http://radimrehurek.com/gensim/models/word2vec.html
    model = Word2Vec(LineSentence(ifname), size=400, window=5,
                     min_count=2, workers=multiprocessing.cpu_count(), negative=5)
    self.logger.info('+++++++++++++++Train Model Finished+++++++++++++++++\n')
    model.save(ofmodel)
    return (model, ofmodel)
def train_word_2_vec(self, model_save_file_name='../../temp_results/word2vec_hindi.txt'):
    model = Word2Vec(LineSentence(self.raw_file_name), size=300, workers=multiprocessing.cpu_count())
    model.wv.save_word2vec_format(model_save_file_name, binary=False)
def train_and_save(sents, output_file, options={}):
    print "Training model..."
    model = Word2Vec(sents, **options)
    model.save(output_file)
def __init__(self, loss='softmax', bucket=0, **kwargs):
    """
    Exactly as the parent class `Word2Vec <https://radimrehurek.com/gensim/models/word2vec.html#gensim.models.word2vec.Word2Vec>`_.
    Some parameter values are overwritten (e.g. sg=0 because we never use skip-gram here); look at the code for details.
    Argument names must be explicit!
    `loss` is one value in {ns, hs, softmax}. With "ns", negative sampling is used as the loss function,
    together with the parameter `negative`. With "hs", hierarchical softmax is used, while "softmax"
    (the default) uses the standard softmax function (the other two are approximations of it).
    The `hs` argument does not exist anymore.
    `bucket` is the maximum number of hashed words, i.e., we limit the feature space to this number,
    ergo we use the hashing trick on the word vocabulary. Defaults to 0: no hashing trick.
    It basically builds two vocabularies, one for the sample words and one for the labels,
    so that the input layer is only made of words, while the output layer is only made of labels.
    **Parent class methods that are not overridden here are not tested and not safe to use.**
    """
    self.lvocab = {}  # Vocabulary of labels only
    self.index2label = []
    kwargs['sg'] = 0
    kwargs['window'] = sys.maxsize
    kwargs['sentences'] = None
    kwargs['hashfxn'] = custom_hash  # Force a consistent function across different Python versions
    self.softmax = self.init_loss(kwargs, loss)
    self.bucket = bucket
    super(LabeledWord2Vec, self).__init__(**kwargs)
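Following the docstring above, a hedged construction sketch (keyword arguments only, as required; the parameter values are illustrative assumptions):

# 'ns' selects negative sampling with 5 noise words; bucket=0 keeps the hashing trick disabled
clf_model = LabeledWord2Vec(loss='ns', negative=5, size=200, min_count=2, bucket=0)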
def train(self, sentences, total_words=None, word_count=0,
          total_examples=None, queue_factor=2, report_delay=1.0):
    """
    Update the model's neural weights from a sequence of sentences (can be a once-only generator stream).
    For Word2Vec, each sentence must be a list of unicode strings. (Subclasses may accept other examples.)
    To support linear learning-rate decay from (initial) alpha to min_alpha, either total_examples
    (count of sentences) or total_words (count of raw words in sentences) should be provided, unless the
    sentences are the same as those that were used to initially build the vocabulary.
    """
    if self.bucket > 0:
        sentences = HashIter(sentences, self.bucket, with_labels=True)
    if self.model_trimmed_post_training:
        raise RuntimeError("Parameters for training were discarded using model_trimmed_post_training method")
    if FAST_VERSION < 0:
        import warnings
        warnings.warn("C extension not loaded for Word2Vec, training will be slow. "
                      "Install a C compiler and reinstall gensim for fast training.")
    self.neg_labels = []
    if self.negative > 0:
        # precompute negative labels optimization for pure-python training
        self.neg_labels = zeros(self.negative + 1)
        self.neg_labels[0] = 1.
    return super(LabeledWord2Vec, self).train(sentences, total_words, word_count,
                                              total_examples, queue_factor, report_delay)
def load_from(cls, other_model):
    """
    Import data and parameter values from another model.
    :param other_model: A ``LabeledWord2Vec`` object, or a ``Word2Vec`` or ``KeyedVectors`` object of Gensim
    """
    softmax = getattr(other_model, 'softmax', False)
    if softmax:
        loss = 'softmax'
    elif not other_model.hs and other_model.negative:
        loss = 'ns'
    else:
        loss = 'hs'
    new_model = LabeledWord2Vec(
        loss=loss,
        negative=other_model.negative if loss == 'ns' else 0,
        size=other_model.vector_size,
        seed=other_model.seed
    )
    new_model.reset_from(other_model)
    for attr in vars(other_model):
        if hasattr(new_model, attr):
            if not isinstance(other_model, LabeledWord2Vec) and (attr == 'syn1' or attr == 'syn1neg'):
                continue
            value = getattr(other_model, attr, getattr(new_model, attr))
            if isinstance(value, KeyedVectors):
                new_model.wv.syn0 = value.syn0
                new_model.wv.syn0norm = value.syn0norm
            else:
                setattr(new_model, attr, value)
    return new_model
def load_w2v(corpus, dictionary):
    '''
    Return the trained Word2Vec model.
    Train a model first if it doesn't exist yet.
    :param corpus:
    :param dictionary:
    :return:
    '''
    if not os.path.isfile(W2V_MODEL_PATH):
        num_features = 300    # Word vector dimensionality
        min_word_count = 5    # Minimum word count
        num_workers = 5       # Number of threads to run in parallel
        window = 5            # Context window size
        downsampling = 1e-5   # Downsample setting for frequent words
        print("Training the word2vec model!")
        sents = get_review_sentences()
        # Initialize and train the model (this will take some time)
        model = models.Word2Vec(sents, workers=num_workers,
                                size=num_features, min_count=min_word_count,
                                window=window, sample=downsampling)
        # If you don't plan to train the model any further, calling
        # init_sims will make the model much more memory-efficient.
        model.init_sims(replace=True)
        # It can be helpful to create a meaningful model name and
        # save the model for later use. You can load it later using Word2Vec.load()
        model.save(W2V_MODEL_PATH)
        tfidf = models.Word2Vec(corpus)   # note: this extra model is built but never used
        print('Word2vec model created!')
    print('Loading word2vec model')
    w2v = models.Word2Vec.load(W2V_MODEL_PATH)
    print('Loading word2vec model completed!')
    return w2v
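A usage sketch for the cache-or-train helper above; `W2V_MODEL_PATH`, `get_review_sentences`, the corpus and the dictionary are assumed to be defined elsewhere in the original module, and the query word below is illustrative.

w2v_model = load_w2v(corpus, dictionary)
print(w2v_model.wv.most_similar('good', topn=3))   # 'good' is a placeholder query word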
Source file: frequent_pattern Item.py — project: Recommendation-based-on-sequence- — author: Bereket123
def main():
    load_sequence('/home/beki/Documents/2nd Year/BD & DM Project/retail_dataset.csv')
    # split patterns into train_patterns and test_patterns
    train_patterns = np.random.choice(patterns, int(np.floor(len(patterns) * 0.8)))
    test_patterns = np.random.choice(patterns, int(np.floor(len(patterns) * 0.2)))
    # Word vector representation learning
    model = Word2Vec(train_patterns, size=15, window=3, min_count=1, workers=1, iter=3, sample=1e-4, negative=20)
    # Test
    test_size = float(len(test_patterns))
    hit = 0.0
    for current_pattern in test_patterns:
        if len(current_pattern) < 2:
            test_size -= 1.0
            continue
        # Reduce the current pattern in the test set by removing the last item
        last_item = current_pattern.pop()
        # Keep those items in the reduced current pattern that are also in the model's vocabulary
        items = [it for it in current_pattern if it in model.vocab]
        if len(items) <= 2:
            test_size -= 1.0
            continue
        # Predict the most similar items to `items`
        prediction = model.most_similar(positive=items)
        # Check if the item that we removed from the test pattern, last_item, is among
        # the predicted ones.
        for predicted_item, score in prediction:
            if predicted_item == last_item:
                hit += 1.0
        #print last_item
        #print prediction
    print 'Accuracy-like measure: {}'.format(hit / test_size)
def learn_embeddings():
    '''
    Learn embeddings by optimizing the Skipgram objective using SGD.
    '''
    logging.info("Initializing creation of the representations...")
    walks = LineSentence('random_walks.txt')
    model = Word2Vec(walks, size=args.dimensions, window=args.window_size, min_count=0, hs=1, sg=1,
                     workers=args.workers, iter=args.iter)
    model.wv.save_word2vec_format(args.output)
    logging.info("Representations created.")
    return
def make_word2vec():
    data_path = tv_classfication.tv_data_path
    sentence = data_work(data_path)
    model = Word2Vec(sentence, size=256, workers=4, window=10, iter=30)
    model.save(tv_classfication.word2vec_path)
Source file: __main__.py — project: GraphEmbeddingsRecommenderSystems — author: himangshunits
def process(args):
    # Create a graph from the training set
    nodedict = graph.records_to_graph()
    # Build the model using DeepWalk and Word2Vec
    G = graph.load_adjacencylist("out.adj", undirected=True)
    ###########################################################################
    # Code Written for BI Project : Author : Himangshu Ranjan Borah(hborah)
    ###########################################################################
    # Call the build_deepwalk_corpus function.
    # Take and populate the arguments from the command line.
    generated_walks = graph.build_deepwalk_corpus(G=G, num_paths=args.number_walks,
                                                  path_length=args.walk_length, alpha=0, rand=random.Random(0))
    # Call word2vec to build the model.
    # print generated_walks
    # The structure looks like ['32173', '32168'], ['124010', '22676'], ['17792', '72925'], ...
    model = Word2Vec(generated_walks, size=args.representation_size, window=args.window_size, min_count=0, workers=args.workers)
    ###########################################################################
    # Code Written for BI Project : Author : Himangshu Ranjan Borah(hborah)
    ###########################################################################
    # Perform some evaluation of the model on the test dataset
    with open("./data/test_user_ratings.dat") as fin:
        fin.next()
        groundtruth = [line.strip().split("\t")[:3] for line in fin]  # (user, movie, rating)
    tr = [int(round(float(g[2]))) for g in groundtruth]
    pr = [predict_rating(model, nodedict, "u" + g[0], "m" + g[1]) for g in groundtruth]
    print "MSE = %f" % mean_squared_error(tr, pr)
    print "accuracy = %f" % accuracy_score(tr, pr)
    cm = confusion_matrix(tr, pr, labels=range(1, 6))
    print cm
def word2vec_train(input_file, output_file):
    sentences = word2vec.LineSentence(input_file)
    model = Word2Vec(sentences, size=300, min_count=10, sg=0, workers=multiprocessing.cpu_count())
    model.save(output_file)
    model.save_word2vec_format(output_file + '.vector', binary=True)
def train():
    extract_sentece()
    in_path = './Data/corpus/sentence.txt'
    out_path = './Data/embedding/word2vec.bin'
    # train the word2vec model
    model = Word2Vec(
        sg=1, sentences=LineSentence(in_path),
        size=256, window=5, min_count=3, workers=4, iter=40)
    model.wv.save_word2vec_format(out_path, binary=True)