def train_word2vec(self, min_count=10, size=100, window=5, workers=3):
    self.word2vec_model = Word2Vec(Word2vecCorpus(self.corpus_file),
                                   min_count=min_count, size=size,
                                   window=window, workers=workers)
Example source code using the Python Word2Vec() class
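The snippets below follow the pre-4.0 gensim API (size=, iter=, model.vocab, save_word2vec_format). As a minimal, self-contained sketch of that API, assuming a toy in-memory corpus of tokenized sentences (the sentences and words are illustrative only):

from gensim.models import Word2Vec

sentences = [["hello", "world"], ["word", "embeddings", "example"]]   # toy corpus (assumption)
model = Word2Vec(sentences, size=100, window=5, min_count=1, workers=4)  # pre-4.0 keyword names
vector = model.wv["hello"]                       # 100-dimensional vector for a vocabulary word
similar = model.wv.most_similar("hello", topn=3)  # nearest neighbours by cosine similarity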
def __init__(self, df, columns, model_param):
    self.df = df
    self.columns = columns
    self.model_param = model_param
    self.model = Word2Vec(sg=self.model_param["sg"],
                          hs=self.model_param["hs"],
                          alpha=self.model_param["alpha"],
                          min_alpha=self.model_param["alpha"],
                          min_count=self.model_param["min_count"],
                          size=self.model_param["size"],
                          sample=self.model_param["sample"],
                          window=self.model_param["window"],
                          workers=self.model_param["workers"])
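A hypothetical model_param dictionary for this constructor might look as follows; the keys mirror the lookups above, and the values are illustrative defaults rather than anything taken from the original project:

model_param = {
    "sg": 1,          # 1 = skip-gram, 0 = CBOW
    "hs": 0,          # 0 = negative sampling, 1 = hierarchical softmax
    "alpha": 0.025,   # also reused as min_alpha above, keeping the learning rate fixed
    "min_count": 5,
    "size": 100,
    "sample": 1e-3,
    "window": 5,
    "workers": 4,
}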
def main():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    # unsupervised data
    hashtag_tweets = 'tweets/hashtag_tweets.gz'
    files = [hashtag_tweets]
    sentences = MySentences(files=files)
    model = models.Word2Vec(sentences, size=100, window=5, min_count=15, workers=8,
                            sg=1, sample=1e-5, hs=1)
    model.save_word2vec_format('embeddings/hashtag_tweets_embedding', binary=False)
def main(in_dir, out_loc, task=1, size=128, window=5, min_count=10,
         n_workers=4, hs=1, nr_iter=5):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    logger = logging.getLogger(__name__)
    model = Word2Vec(
        sg=task,
        size=size,
        window=window,
        min_count=min_count,
        workers=n_workers,
        hs=hs,          # pass the parameter through instead of hard-coding 1
        iter=nr_iter
    )
    corpus = Corpus(in_dir)
    total_words = 0
    total_sents = 0
    for text_no, text_loc in enumerate(iter_dir(corpus.directory)):
        with io.open(text_loc, 'r', encoding='utf8') as file_:
            try:
                text = file_.read()
            except UnicodeDecodeError:
                print(text_loc)
                continue  # skip undecodable files instead of reusing a stale `text`
        total_sents += text.count('\n')
        total_words += corpus.count_doc(text.split())
        logger.info("PROGRESS: at batch #%i, processed %i words, keeping %i word types",
                    text_no, total_words, len(corpus.strings))
    # Build the vocabulary by hand from the corpus counts, then train.
    model.corpus_count = total_sents
    model.raw_vocab = defaultdict(int)
    for key, string in corpus.strings.items():
        model.raw_vocab[string] = corpus.counts[key]
    model.scale_vocab()
    model.finalize_vocab()
    model.iter = nr_iter
    model.train(corpus)
    # Trim down the model to just the vectors needed for querying.
    model.init_sims(replace=True)
    model.save(out_loc)
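The snippet above wires the vocabulary into the model by hand through raw_vocab, scale_vocab(), and finalize_vocab(). For comparison, a sketch of the standard path in the same older gensim API, assuming sentences is an iterable of token lists:

model = Word2Vec(sg=1, size=128, window=5, min_count=10, workers=4, hs=1, iter=5)
model.build_vocab(sentences)   # scans the corpus and builds the vocabulary internally
model.train(sentences)         # pre-1.0 gensim call; newer versions also require total_examples and epochs
model.init_sims(replace=True)  # trim to unit-normalized vectors for querying
model.save('word2vec.model')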
def train(self, **kargs):
    self.config.update(kargs)
    self.model = _Word2Vec(list(self.database.sentences), **self.config)
    delattr(self, "database")
def train_rnas(seq_file='utrs.fa', outfile='rnadocEmbedding25.pickle'):
    min_count = 5
    dim = 50
    window = 5
    print('dim: ' + str(dim) + ', window: ' + str(window))
    seq_dict = read_fasta_file(seq_file)
    #text = seq_dict.values()
    tris = get_6_trids()
    sentences = []
    for seq in seq_dict.values():
        seq = seq.replace('T', 'U')
        bag_sen = []
        bag_seqs = split_overlap_seq(seq)
        for new_seq in bag_seqs:
            trvec = get_4_nucleotide_composition(tris, new_seq)
            bag_sen.append(trvec)
        #for aa in range(len(text)):
        sentences.append(bag_sen)
    #pdb.set_trace()
    print(len(sentences))
    model = None
    docs = train_tag_doc(sentences)
    #model = Word2Vec(sentences, min_count=min_count, size=dim, window=window, sg=1, iter=10, batch_words=100)
    #model = gensim.models.doc2vec.Doc2Vec(docs, size=50, window=300, min_count=min_count, workers=4)
    model = gensim.models.doc2vec.Doc2Vec(size=50, min_count=min_count, iter=50)
    model.build_vocab(docs)
    model.train(docs)
    '''vocab = list(model.vocab.keys())
    print vocab
    fw = open('rna_doc_dict', 'w')
    for val in vocab:
        fw.write(val + '\n')
    fw.close()
    #print model.syn0
    #pdb.set_trace()
    embeddingWeights = np.empty([len(vocab), dim])
    for i in range(len(vocab)):
        embeddingWeights[i,:] = model[vocab[i]]
    allWeights.append(embeddingWeights)
    '''
    #model.infer_vector(['only', 'you', 'can', 'prevent', 'forrest', 'fires'])
    #with open(outfile, 'w') as f:
    #    pickle.dump(model, f)
    # store the model to mmap-able files
    pdb.set_trace()
    model.save(outfile)
    # load the model back
    #model_loaded = Doc2Vec.load(outfile)
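As the trailing comment hints, the saved Doc2Vec model can be loaded back and used to infer a vector for a new k-mer "sentence". A minimal sketch, assuming the same output path as above and hypothetical 6-mer tokens:

from gensim.models.doc2vec import Doc2Vec

model_loaded = Doc2Vec.load('rnadocEmbedding25.pickle')
new_doc = ['AUGGCU', 'UGGCUA', 'GGCUAG']        # hypothetical 6-mer tokens
vector = model_loaded.infer_vector(new_doc)     # 50-dimensional document vector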
def load_w2v(self):
    """
    Load Word2Vec embeddings from P2FA files and a pre-trained Word2Vec
    KeyedVectors text file, and store them in the directory path given
    by self.embedding_dir.
    :returns: segment-wise feature dictionary for embeddings
    :Note: Do not provide the KeyedVectors file in binary format
    """
    from gensim.models.keyedvectors import KeyedVectors
    from gensim.models import Word2Vec
    is_binary = True if self.embed_model_type == "binary" else False
    model = KeyedVectors.load_word2vec_format(self.embed_model_path,
                                              binary=is_binary)
    print "Word2Vec model loaded"
    self.embed_model = model
    self.embed_length = model.vector_size
    if not self.word_dict:
        self.load_words()
    features = {}
    system("mkdir -p " + self.embedding_dir)
    for video_id, video_word_data in self.word_dict.iteritems():
        video_feats = {}
        for segment_id, segment_word_data in video_word_data.iteritems():
            video_feats[segment_id] = []
            for word_feat in segment_word_data:
                start, end, word = word_feat
                try:
                    embed = self.embed_model[word]
                except:
                    embed = np.zeros(self.embed_length)
                video_feats[segment_id].append((start, end, embed))
            fname = video_id + "_" + segment_id + ".csv"
            fpath = join(self.embedding_dir, fname)
            with open(fpath, "wb") as fh:
                # Write each (start, end, embedding) feature of the segment as one CSV row
                for f in video_feats[segment_id]:
                    f_start = str(f[0])
                    f_end = str(f[1])
                    f_val = [str(val) for val in f[2].tolist()]
                    str2write = ",".join([f_start, f_end] + f_val)
                    str2write += "\n"
                    fh.write(str2write)
        features[video_id] = video_feats
    return features
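A minimal standalone sketch of the KeyedVectors loading and lookup pattern used above, with an illustrative model path and word; out-of-vocabulary words fall back to a zero vector as in the method:

from gensim.models.keyedvectors import KeyedVectors
import numpy as np

kv = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
try:
    vec = kv['hello']                   # 300-dimensional vector
except KeyError:
    vec = np.zeros(kv.vector_size)      # zero vector for out-of-vocabulary words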
def makeFeature(df_features):
    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    print('get sentence vectors')
    model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
    # model = KeyedVectors.load_word2vec_format('glove.6B.300d.txt', binary=False)
    # model = Word2Vec(brown.sents())
    df_features['vec1'] = df_features.q1_expand.map(lambda x: getVec(x, model))
    df_features['vec2'] = df_features.q2_expand.map(lambda x: getVec(x, model))
    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    print('get six similarity coefficients between sentence vectors')
    df_features['f_cosine'] = df_features.apply(lambda x: Cosine(x['vec1'], x['vec2']), axis=1)
    df_features['f_manhatton'] = df_features.apply(lambda x: Manhatton(x['vec1'], x['vec2']), axis=1)
    df_features['f_euclidean'] = df_features.apply(lambda x: Euclidean(x['vec1'], x['vec2']), axis=1)
    df_features['f_pearson'] = df_features.apply(lambda x: PearsonSimilar(x['vec1'], x['vec2']), axis=1)
    df_features['f_spearman'] = df_features.apply(lambda x: SpearmanSimilar(x['vec1'], x['vec2']), axis=1)
    df_features['f_kendall'] = df_features.apply(lambda x: KendallSimilar(x['vec1'], x['vec2']), axis=1)
    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    print('get three word2vec-based document similarity coefficients')
    df_features['f_cosine_w2v'] = df_features.apply(lambda x: getfromw2v(x['q1_expand'], x['q2_expand'], Cosine, model), axis=1)
    df_features['f_euclidean_w2v'] = df_features.apply(lambda x: getfromw2v(x['q1_expand'], x['q2_expand'], Euclidean, model), axis=1)
    df_features['f_manhatton_w2v'] = df_features.apply(lambda x: getfromw2v(x['q1_expand'], x['q2_expand'], Manhatton, model), axis=1)
    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    print('get set-overlap coefficients for raw, expanded, noun, verb, and adjective tokens')
    df_features['f_raw_jaccarc'] = df_features.apply(lambda x: Jaccarc(x['question1'], x['question2']), axis=1)
    df_features['f_raw_dice'] = df_features.apply(lambda x: Dice(x['question1'], x['question2']), axis=1)
    df_features['f_raw_ochiai'] = df_features.apply(lambda x: Ochiai(x['question1'], x['question2']), axis=1)
    df_features['f_expand_jaccarc'] = df_features.apply(lambda x: Jaccarc(x['q1_expand'], x['q2_expand']), axis=1)
    df_features['f_expand_dice'] = df_features.apply(lambda x: Dice(x['q1_expand'], x['q2_expand']), axis=1)
    df_features['f_expand_ochiai'] = df_features.apply(lambda x: Ochiai(x['q1_expand'], x['q2_expand']), axis=1)
    df_features['f_nouns_jaccarc'] = df_features.apply(lambda x: Jaccarc(x['question1_nouns'], x['question2_nouns']), axis=1)
    df_features['f_nouns_dice'] = df_features.apply(lambda x: Dice(x['question1_nouns'], x['question2_nouns']), axis=1)
    df_features['f_nouns_ochiai'] = df_features.apply(lambda x: Ochiai(x['question1_nouns'], x['question2_nouns']), axis=1)
    df_features['f_verbs_jaccarc'] = df_features.apply(lambda x: Jaccarc(x['question1_verbs'], x['question2_verbs']), axis=1)
    df_features['f_verbs_dice'] = df_features.apply(lambda x: Dice(x['question1_verbs'], x['question2_verbs']), axis=1)
    df_features['f_verbs_ochiai'] = df_features.apply(lambda x: Ochiai(x['question1_verbs'], x['question2_verbs']), axis=1)
    df_features['f_adjs_jaccarc'] = df_features.apply(lambda x: Jaccarc(x['question1_adjs'], x['question2_adjs']), axis=1)
    df_features['f_adjs_dice'] = df_features.apply(lambda x: Dice(x['question1_adjs'], x['question2_adjs']), axis=1)
    df_features['f_adjs_ochiai'] = df_features.apply(lambda x: Ochiai(x['question1_adjs'], x['question2_adjs']), axis=1)
    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    print('get weighted overlap for the expanded questions')
    weights = word_weights(df_features)
    df_features['f_weighted_overlap'] = df_features.apply(lambda x: weighted_Overlap(x['q1_expand'], x['q2_expand'], weights), axis=1)
    print('all done')
    now = datetime.datetime.now()
    print(now.strftime('%Y-%m-%d %H:%M:%S'))
    df_features = df_features.fillna(0.0)  # fillna returns a new DataFrame; assign it back
    return df_features
def get_word_embeddings(num_dimensions=500,
                        cache_loc=EMBEDDINGS_FILE):
    """Generates word embeddings.
    Args:
        num_dimensions: int, number of embedding dimensions.
        cache_loc: str, where to cache the word embeddings.
    Returns:
        numpy array representing the embeddings, with shape (NUM_TOKENS,
        num_dimensions).
    """
    if os.path.exists(cache_loc):
        embeddings = np.load(cache_loc)
    else:
        class SentenceGenerator(object):
            def __iter__(self):
                iterable = itertools.islice(iterate_qa_pairs(), 1000000)
                for i, (question, answer) in enumerate(iterable, 1):
                    q, a, _, _ = tokenize(question=question, answer=answer,
                                          use_pad=False, include_rev=False)
                    yield [str(w) for w in q]
                    yield [str(w) for w in a]
                    del q, a
                    if i % 1000 == 0:
                        sys.stderr.write('\rprocessed %d' % i)
                        sys.stderr.flush()
                sys.stderr.write('\rprocessed %d\n' % i)
                sys.stderr.flush()
        # The default embeddings.
        embeddings = np.random.normal(size=(NUM_TOKENS, num_dimensions))
        sentences = SentenceGenerator()
        model = models.Word2Vec(sentences, size=num_dimensions)
        word_vectors = model.wv
        del model
        # Put the Word2Vec weights into the right order.
        weights = word_vectors.syn0
        vocab = word_vectors.vocab
        for k, v in vocab.items():
            embeddings[int(k)] = weights[v.index]
        with open(cache_loc, 'wb') as f:
            np.save(f, embeddings)
    assert embeddings.shape == (NUM_TOKENS, num_dimensions)
    return embeddings
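The reordering step above relies on the pre-4.0 gensim attributes wv.syn0 (the trained weight matrix) and wv.vocab (a dict mapping each token to an entry with an .index). A small sketch of that mapping, under the same assumption that tokens are stringified integer ids:

from gensim.models import Word2Vec
import numpy as np

sentences = [['0', '1', '2'], ['1', '2', '3']]           # tokens are stringified ids (assumption)
model = Word2Vec(sentences, size=8, min_count=1)
embeddings = np.zeros((4, 8))
for token, entry in model.wv.vocab.items():
    embeddings[int(token)] = model.wv.syn0[entry.index]  # row for token id = its trained vector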
def get_global_embeddings(self, filenames, embedding_size, embedding_dir):
    """ Construct the embedding matrix for the sentences in filenames.
    Args:
        filenames: Names of the training files on which the vocab is
            built. These are used when no pretrained embeddings are
            present; instead of using random embeddings, Word2Vec is
            trained on the available dataset.
        embedding_size: Dimensionality of the embeddings.
        embedding_dir: Directory in which embeddings are cached.
    Returns:
        Embedding matrix.
    """
    sentences = []
    if os.path.exists(embedding_dir + 'vocab_len.pkl'):
        vocab_len_stored = pickle.load(open(embedding_dir + "vocab_len.pkl", "rb"))
    else:
        vocab_len_stored = 0
    if vocab_len_stored == self.len_vocab and os.path.exists(embedding_dir + "embeddings.pkl"):
        print("Loading cached embeddings")
        self.embeddings = pickle.load(open(embedding_dir + "embeddings.pkl", "rb"))
        return None
    if os.path.exists(embedding_dir + 'embeddings'):
        model = KeyedVectors.load_word2vec_format(embedding_dir + 'embeddings', binary=False)
        print("Loading pretrained embeddings")
    else:
        for file in filenames:
            with open(file, 'rb') as f:
                for lines in f:
                    words = [lines.split()]
                    sentences.extend(words)
        model = Word2Vec(sentences, size=embedding_size, min_count=0)
        # Save in word2vec text format so the load_word2vec_format branch above can read it back
        model.wv.save_word2vec_format(embedding_dir + 'embeddings', binary=False)
    self.embeddings_model = model
    return model
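A minimal sketch of the matching save/load pair assumed above: the text format written by wv.save_word2vec_format is what KeyedVectors.load_word2vec_format expects (the path and corpus are illustrative):

from gensim.models import Word2Vec, KeyedVectors

model = Word2Vec([['a', 'b', 'c'], ['b', 'c', 'd']], size=16, min_count=0)
model.wv.save_word2vec_format('embeddings.txt', binary=False)   # "<vocab_size> <dim>" header, one word per line
reloaded = KeyedVectors.load_word2vec_format('embeddings.txt', binary=False)
assert reloaded.vector_size == 16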
def train_batch_score_cbow_xy_generator(model, scored_word_sentences):
    for scored_word_sentence in scored_word_sentences:
        #print scored_word_sentence
        scored_word_vocabs = [[model.vocab[w], s] for [w, s] in scored_word_sentence
                              if w in model.vocab and model.vocab[w].sample_int > model.random.rand() * 2**32]
        for pos, scored_word in enumerate(scored_word_vocabs):
            reduced_window = model.random.randint(model.window)  # `b` in the original word2vec code
            start = max(0, pos - model.window + reduced_window)
            window_pos = enumerate(scored_word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
            word2_indices = [scored_word2[0].index for pos2, scored_word2 in window_pos
                             if (scored_word2 is not None and scored_word2[0] is not None and pos2 != pos)]
            xy_gen = train_cbow_pair(model, scored_word[0], word2_indices, None, None)
            for xy in xy_gen:
                if xy is not None:
                    xy1 = [xy[0], xy[1], xy[2], [scored_word[1]]]
                    yield xy1
            # if xy != None:
            #     xy1 = [xy[0], xy[1], xy[2], scored_word[1]]
            #     yield xy1
def __init__(self, fname='data/korean_word2vec', dim=300):
    self.dim = dim
    try:
        # load saved model
        print('Loading korean word2vec model')
        self.model = word2vec.Word2Vec.load(fname)
    except:
        print(':: There is no word2vec model')
def extract_countries():
    countries_vec = {}
    vec = word2vec.Word2Vec.load("word2vec")
    for line in open("../chapter09/countries.txt", "r"):
        country = line.strip().replace(" ", "_")
        if country in vec.vocab.keys():
            countries_vec[country] = vec[country]
    return countries_vec
utterance_embed.py (project: dstc6_dialogue_breakdown_task, author: JudeLee19)
def __init__(self, file_name, dim=300):
    self.dim = dim
    try:
        print('Loading english word2vec model')
        self.word2vec_model = word2vec.Word2Vec.load(file_name)
    except:
        print('Error while loading word2vec model')
def load_embedding(data, embedding_file, binary=True, prefix=None, file_name='embedding.pkl'):
    """
    :param data: iterable of (sentence, label) pairs used to build the vocabulary
    :param embedding_file: path to a pretrained word2vec file
    :param binary: whether embedding_file is in binary word2vec format
    :param prefix: if prefix is None, build the embedding and write it to file_name,
        else load the pickled embedding from prefix
    :param file_name: pickle file to write the embedding to
    :return: (vocab_size, word_idx, embedding)
    """
    if prefix is None:
        vocab = sorted(reduce(lambda x, y: x | y, (set(sentence) for sentence, _ in data)))
        word_idx = dict((c, i) for i, c in enumerate(vocab))
        vocab_size = len(word_idx) + 1  # +1 for nil word
        # e.g. "/home/junfeng/word2vec/GoogleNews-vectors-negative300.bin"
        model = word2vec.Word2Vec.load_word2vec_format(embedding_file, binary=binary)
        embedding = []
        for c in word_idx:
            if c in model:
                embedding.append(model[c])
            else:
                # random vector in [-0.1, 0.1) for out-of-vocabulary words
                embedding.append(np.random.uniform(-0.1, 0.1, 300))
        embedding = np.array(embedding, dtype=np.float32)
        with open(file_name, 'wb') as f:
            pickle.dump(embedding, f)
            pickle.dump(vocab_size, f)
            pickle.dump(word_idx, f)
    else:
        with open(prefix, 'rb') as f:
            embedding = pickle.load(f)
            vocab_size = pickle.load(f)
            word_idx = pickle.load(f)
    return vocab_size, word_idx, embedding
def train_batch_sg(model, sentences, alpha=None, work=None, sub_batch_size=256, batch_size=256):
    batch_count = 0
    sub_batch_count = 0
    train_x0 = np.zeros((batch_size, sub_batch_size), dtype='int32')
    train_x1 = np.zeros((batch_size, sub_batch_size), dtype='int32')
    train_y = np.zeros((batch_size, sub_batch_size), dtype='int8')
    while 1:
        for sentence in sentences:
            word_vocabs = [model.vocab[w] for w in sentence if w in model.vocab and
                           model.vocab[w].sample_int > model.random.rand() * 2**32]
            for pos, word in enumerate(word_vocabs):
                reduced_window = model.random.randint(model.window)  # `b` in the original word2vec code
                # now go over all words from the (reduced) window, predicting each one in turn
                start = max(0, pos - model.window + reduced_window)
                #window_length = len(word_vocabs[start:(pos + model.window + 1 - reduced_window)])
                #print window_length,
                for pos2, word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start):
                    # don't train on the `word` itself
                    if pos2 != pos:
                        xy_gen = train_sg_pair(model, model.index2word[word.index], word2.index)
                        for xy in xy_gen:
                            if xy is not None:
                                (x0, x1, y) = xy
                                train_x0[batch_count][sub_batch_count] = x0
                                train_x1[batch_count][sub_batch_count] = x1
                                train_y[batch_count][sub_batch_count] = y
                                sub_batch_count += 1
                                if sub_batch_count >= sub_batch_size:
                                    batch_count += 1
                                    sub_batch_count = 0
                                    if batch_count >= batch_size:
                                        yield {'index': train_x0, 'point': train_x1, 'code': train_y}
                                        batch_count = 0
def train_batch_cbow_xy_generator(model, sentences):
    for sentence in sentences:
        word_vocabs = [model.vocab[w] for w in sentence if w in model.vocab and
                       model.vocab[w].sample_int > model.random.rand() * 2**32]
        for pos, word in enumerate(word_vocabs):
            reduced_window = model.random.randint(model.window)  # `b` in the original word2vec code
            start = max(0, pos - model.window + reduced_window)
            window_pos = enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start)
            word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)]
            xy_gen = train_cbow_pair(model, word, word2_indices, None, None)
            for xy in xy_gen:
                if xy is not None:
                    yield xy
def train_batch_score_sg(model, scored_word_sentences,
                         score_vector_size,
                         alpha=None, work=None,
                         sub_batch_size=256,
                         batch_size=256):
    batch_count = 0
    sub_batch_count = 0
    train_x0 = np.zeros((batch_size, sub_batch_size), dtype='int32')
    train_x1 = np.zeros((batch_size, sub_batch_size), dtype='int32')
    train_y0 = np.zeros((batch_size, sub_batch_size), dtype='int8')
    train_y1 = np.zeros((batch_size, sub_batch_size, score_vector_size), dtype='float32')
    # train_x0 = [[0]]*batch_size
    # train_x1 = [[0]]*batch_size
    # train_y0 = [[0]]*batch_size
    # train_y1 = [[0]]*batch_size
    while 1:
        for scored_word_sentence in scored_word_sentences:
            #sentence = [scored_word2word(scored_word) for scored_word in scored_word_sentence]
            word_vocabs = [[model.vocab[w], s] for [w, s] in scored_word_sentence if w in model.vocab and
                           model.vocab[w].sample_int > model.random.rand() * 2**32]
            for pos, scored_word in enumerate(word_vocabs):
                reduced_window = model.random.randint(model.window)  # `b` in the original word2vec code
                word = scored_word2word(scored_word)
                # now go over all words from the (reduced) window, predicting each one in turn
                start = max(0, pos - model.window + reduced_window)
                for pos2, scored_word2 in enumerate(word_vocabs[start:(pos + model.window + 1 - reduced_window)], start):
                    word2 = scored_word2word(scored_word2)
                    # don't train on the `word` itself
                    if pos2 != pos:
                        xy_gen = train_sg_pair(model, model.index2word[word.index], word2.index)  # , alpha)
                        for xy in xy_gen:
                            if xy is not None:
                                (x0, x1, y0) = xy
                                y1 = scored_word2score(scored_word)
                                train_x0[batch_count][sub_batch_count] = x0
                                train_x1[batch_count][sub_batch_count] = x1
                                train_y0[batch_count][sub_batch_count] = y0
                                train_y1[batch_count][sub_batch_count] = y1
                                sub_batch_count += 1
                                if sub_batch_count >= sub_batch_size:
                                    batch_count += 1
                                    sub_batch_count = 0
                                    if batch_count >= batch_size:
                                        yield {'index': train_x0, 'point': train_x1, 'code': train_y0, 'score': train_y1}
                                        batch_count = 0
                                # train_x0[batch_count] = [x0]
                                # train_x1[batch_count] = x1
                                # train_y0[batch_count] = y0
                                # train_y1[batch_count] = y1
                                # #print train_x0, train_y1,
                                # batch_count += 1
                                # if batch_count >= batch_size :
                                #     #print { 'index':np.array(train_x0), 'point':np.array(train_x1), 'code':np.array(train_y0),'score':np.array(train_y1)}
                                #     #yield { 'index':np.array(train_x0), 'point':np.array(train_x1), 'code':np.array(train_y0),'score':np.array(train_y1,dtype=float32)}
                                #     yield { 'index':np.array(train_x0), 'point':np.array(train_x1), 'code':np.array(train_y0),'score':np.array(train_y1)}
                                #     batch_count = 0