def load_w2v(self):
    """
    Build per-segment Word2Vec features and dump them to CSV files.

    Loads the pre-trained vectors at ``self.embed_model_path`` with
    ``KeyedVectors.load_word2vec_format`` (binary parsing is requested only
    when ``self.embed_model_type == "binary"``), stores the model and its
    vector size on ``self.embed_model`` / ``self.embed_length``, and calls
    ``self.load_words()`` first when ``self.word_dict`` is empty.

    For every segment one file ``<video_id>_<segment_id>.csv`` is written
    into ``self.embedding_dir``; each row is ``start,end,v_0,...,v_k`` for
    one word of the segment. Words missing from the model vocabulary are
    represented by an all-zero vector of length ``self.embed_length``.

    :returns: dict mapping video_id -> {segment_id -> list of
        (start, end, embedding) tuples}
    :raises OSError: if ``self.embedding_dir`` cannot be created
    :Note: NOTE(review): the previous docstring told callers not to supply
        a binary KeyedVectors file, yet the code does request binary
        parsing for ``embed_model_type == "binary"`` -- confirm which is
        intended.
    """
    import os
    from gensim.models.keyedvectors import KeyedVectors

    model = KeyedVectors.load_word2vec_format(
        self.embed_model_path,
        binary=(self.embed_model_type == "binary"),
    )
    print("Word2Vec model Loaded")
    self.embed_model = model
    self.embed_length = model.vector_size

    if not self.word_dict:
        self.load_words()

    out_dir = self.embedding_dir
    # Create the directory directly instead of shelling out to "mkdir -p":
    # no quoting problems with unusual paths, and real failures surface.
    # An empty out_dir means "current directory", which needs no creation.
    if out_dir:
        try:
            os.makedirs(out_dir)
        except OSError:
            # "Already exists" is fine (mkdir -p semantics); re-raise the rest.
            if not os.path.isdir(out_dir):
                raise

    features = {}
    for video_id, video_word_data in self.word_dict.items():
        video_feats = {}
        for segment_id, segment_word_data in video_word_data.items():
            segment_feats = []
            for start, end, word in segment_word_data:
                try:
                    embed = model[word]
                except KeyError:
                    # Out-of-vocabulary word: fall back to a zero vector.
                    embed = np.zeros(self.embed_length)
                segment_feats.append((start, end, embed))
            video_feats[segment_id] = segment_feats

            # One CSV per segment, one row per word.
            fname = video_id + "_" + segment_id + ".csv"
            fpath = os.path.join(out_dir, fname)
            rows = [
                ",".join(
                    [str(f_start), str(f_end)]
                    + [str(val) for val in f_embed.tolist()]
                ) + "\n"
                for f_start, f_end, f_embed in segment_feats
            ]
            # Text mode: the rows are str, which "wb" rejects on Python 3.
            with open(fpath, "w") as fh:
                fh.writelines(rows)

        features[video_id] = video_feats
    return features
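# [web-page residue, not part of the code] Comment list
# [web-page residue, not part of the code] Table of contents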