def main():
    # signature()
    # sentences = TextLoader()
    # model = gensim.models.Word2Vec(sentences, workers=8)
    # model.save('word2vector.model')
    # print 'word2vec ok'
    # word2vec = Word2vec()
    # word2vec.BetweenToVec()
    # pro_pro()
    # OneHot()
    #
    # c = cluster()
    # c.Cluster(0.7, 'one_hot_vec.txt', '4_cluster.txt')
    negtive_bet_many()
def text_to_vector(sentence_list, MAX_SENTENCE=78, model=None):
    if model is None:
        model = models.Word2Vec.load_word2vec_format(
            local_ref('../storage/pos_tagger/GoogleNews-vectors-negative300.bin'), binary=True)
    X = np.zeros((MAX_SENTENCE, len(sentence_list), 300))
    capitals = np.zeros((MAX_SENTENCE, len(sentence_list), 3))
    vectorize = lambda x: model[x] if x in model else np.zeros(300)
    mask = []
    for i, sentence in enumerate(sentence_list):
        for j, word in enumerate(sentence):
            if j == MAX_SENTENCE:
                j -= 1
                break
            X[j][i] = vectorize(word)
            capitals[j][i] = cap_vector(word)
        mask.append(j + 1)
    mask = np.array(mask)
    return X, capitals, mask
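A minimal usage sketch for text_to_vector, assuming local_ref resolves the GoogleNews binary path and that cap_vector is defined as in the source; the tokenized sentences below are made up for illustration.

# Hypothetical example: embed two tokenized sentences with a preloaded model
# (loading the GoogleNews vectors once and passing them in avoids reloading per call).
from gensim import models

w2v = models.Word2Vec.load_word2vec_format(
    local_ref('../storage/pos_tagger/GoogleNews-vectors-negative300.bin'), binary=True)
sentences = [['The', 'cat', 'sat'], ['Dogs', 'bark']]
X, capitals, mask = text_to_vector(sentences, MAX_SENTENCE=78, model=w2v)
print(X.shape)  # (78, 2, 300) -> (max words, num sentences, embedding dim)
print(mask)     # [3 2]        -> true length of each sentence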
def get_word(word):
    inst = re.search(r"_\(([A-Za-z0-9_]+)\)", word)
    if inst is None:
        length = len(word.split("_"))
        if length < 5:
            return True, word
    else:
        if inst.group(1) != "disambiguation":
            word2 = re.sub(r'_\(.+\)', '', word)
            if len(word2.split(" ")) < 5:
                return True, word
    return False, word
# Load the trained doc2vec and word2vec models.
def create_word2vec_model(embedding_size, input_file=TEXT_DIR):
    """
    Create the word2vec model based on the given embedding size and the corpus file.
    :param embedding_size: The embedding size
    :param input_file: The corpus file
    """
    word2vec_file = 'word2vec_' + str(embedding_size) + '.model'
    if os.path.isfile(word2vec_file):
        logging.info('The word2vec model you want to create already exists!')
    else:
        sentences = word2vec.LineSentence(input_file)
        # sg=0 means use the CBOW model (default); sg=1 means use the skip-gram model.
        model = gensim.models.Word2Vec(sentences, size=embedding_size, min_count=0,
                                       sg=0, workers=multiprocessing.cpu_count())
        model.save(word2vec_file)
def load_word2vec_matrix(vocab_size, embedding_size):
    """
    Return the word2vec model matrix.
    :param vocab_size: The vocab size of the word2vec model file
    :param embedding_size: The embedding size
    :return: The word2vec model matrix
    """
    word2vec_file = 'word2vec_' + str(embedding_size) + '.model'
    if os.path.isfile(word2vec_file):
        model = gensim.models.Word2Vec.load(word2vec_file)
        vocab = dict([(k, v.index) for k, v in model.wv.vocab.items()])
        vector = np.zeros([vocab_size, embedding_size])
        for key, value in vocab.items():
            if len(key) > 0:
                vector[value] = model[key]
        return vector
    else:
        logging.info("The word2vec file doesn't exist. "
                     "Please use the <create_word2vec_model(embedding_size)> function to create it!")
Source: keyterm_classification.py (project: contextual-advertising-deploy, author: andreicnica)
def __init__(self, classes=None, classesFile=None,
             classesClusterPath=None,
             modelPath="dataset/frWac_non_lem_no_postag_no_phrase_200_cbow_cut100.bin", modelBinary=True):
    if not classes and not classesFile:
        print "ERROR: MUST LOAD CLASS FILE"
        return
    if not classes:
        classes = self.load_adv_keyterms_from_file(classesFile)
    # load cluster
    if classesClusterPath:
        self.classesClusters = load_cluster_dataset(classesClusterPath)
    else:
        # process cluster from classes
        # TODO
        self.classesClusters = None
    self.model = gensim.models.Word2Vec.load_word2vec_format(modelPath, binary=modelBinary)
    self._preProcessClasses(classes)
def train_artistsong2vec_model(fout_path, input_datas=None, data_path=None,
                               min_count=5, sorted_vocab=1, window=10,
                               size=250,
                               iter_n=50):
    if not input_datas and data_path:
        input_datas = pickle.load(open(data_path, 'rb'))
    full_data = []
    for i in input_datas:
        tmp = []
        for j in i:
            tmp.append(j[0])
            tmp.append(j[1])
        full_data.append(tmp)
    data_process_logger.info('start training')
    wv_model = gensim.models.Word2Vec(full_data, min_count=min_count, sorted_vocab=sorted_vocab, window=window,
                                      size=size, iter=iter_n)
    with open(fout_path, 'wb') as fout:
        data_process_logger.info('start saving model')
        pickle.dump(wv_model, fout)
    print 'model saved'
def main():
    # sentences = TextLoader()
    # model = gensim.models.Word2Vec(sentences, sg=1, min_count=5, size=50, workers=8, window=5)
    # model.save('word2vector.model')
    # print 'word2vec ok'
    #
    # pro_cluster('error.txt', 'error_cluster_word2vec.txt', 0.2)
    all_cluster()
def main():
    sentences = TextLoader()
    model = gensim.models.Word2Vec(sentences, sg=1, min_count=5, size=50, workers=8, window=2)
    model.save('word2vector.model')
    print 'word2vec ok'
    # word2vec = Word2vec()
    # word2vec.BetweenToVec()
def create_embedding_matrix(model):
    # Convert the wv word vectors into a numpy matrix that is suitable for insertion
    # into our TensorFlow and Keras models.
    embedding_matrix = np.zeros((len(model.wv.vocab), vector_dim))
    for i in range(len(model.wv.vocab)):
        embedding_vector = model.wv[model.wv.index2word[i]]
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
    return embedding_matrix
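The comment above mentions handing the matrix to TensorFlow/Keras; a minimal sketch of that hand-off, assuming vector_dim matches the trained vectors and a gensim model is in scope. The frozen-layer choice (trainable=False) is illustrative, not from the source.

# Hypothetical: wrap the gensim vectors in a frozen Keras Embedding layer.
from keras.layers import Embedding

embedding_matrix = create_embedding_matrix(model)
embedding_layer = Embedding(input_dim=embedding_matrix.shape[0],   # vocabulary size
                            output_dim=embedding_matrix.shape[1],  # vector_dim
                            weights=[embedding_matrix],
                            trainable=False)                       # keep pretrained vectors fixed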
def train_tag_doc(doc1):
    docs = []
    analyzedDocument = namedtuple('AnalyzedDocument', 'words tags')
    for i, text in enumerate(doc1):
        words = text.lower().split()
        tags = [i]
        docs.append(analyzedDocument(words, tags))
        # docs.append(gensim.models.doc2vec.TaggedDocument(words, [i]))
    return docs
def loadIndexes(self):
    if self.model or self.idfIndex:
        return
    # global model
    print "loading word2vec model"
    self.model = gensim.models.KeyedVectors.load_word2vec_format("models/GoogleNews-vectors-negative300.bin", binary=True)  # C binary format
    print "done"
    # model = gensim.models.Word2Vec.load_word2vec_format("models/glove_model.txt", binary=False)  # C text format
    # global idfIndex
    print "loading idfIndex model"
    self.idfIndex = indexManager.getIndex("plainIdfIndex.txt")
    print "done"
    # return (model, idfIndex)
def test2(self):
    topWords = self.getTopTfIdfTerms("Jerusalem")
    for word in topWords:
        # print word, idfIndex[word] if word in idfIndex else 1.5
        try:
            print self.model.most_similar(positive=[word], topn=10)
        except KeyError:
            print "word not in vocabulary"
    # print model.accuracy(r"C:\Users\David\workspace\Wiki\gitWiki\questions-words.txt")
    # model = word2vec.Word2Vec(sentences)
    # model = word2vec.Word2Vec.load_word2vec_format("C:\Users\David\workspace\Wiki\gitWiki\text8-queen", binary=False)
    # model = gensim.models.Word2Vec.load_word2vec_format('/tmp/vectors.txt', binary=False)
def train_song2vec_model(fout_path, input_datas=None, data_path=None, min_count=5, sorted_vocab=1, window=10, size=250,
                         iter_n=50):
    """
    Train the song2vec model.
    Args:
        fout_path: output path for the pickled model
        input_datas: training sequences; if None, they are loaded from data_path
        data_path: path to a pickled dataset, used when input_datas is not given
        min_count: minimum term frequency passed to Word2Vec
        sorted_vocab: whether gensim should sort the vocabulary
        window: context window size
        size: embedding dimensionality
        iter_n: number of training iterations
    Returns:
        None; the trained model is pickled to fout_path
    """
    if not input_datas and data_path:
        input_datas = pickle.load(open(data_path, 'rb'))
    data_process_logger.info('start training')
    random.shuffle(input_datas)
    input_datas = input_datas[:45000]
    wv_model = gensim.models.Word2Vec(input_datas, min_count=min_count, sorted_vocab=sorted_vocab, window=window,
                                      size=size, iter=iter_n)
    with open(fout_path, 'wb') as fout:
        data_process_logger.info('start saving model')
        pickle.dump(wv_model, fout)
    print 'model saved'
def train_artist2vec_model(fout_path, input_datas=None, data_path=None, min_count=5, sorted_vocab=1, window=10,
                           size=250,
                           iter_n=50):
    if not input_datas and data_path:
        input_datas = pickle.load(open(data_path, 'rb'))
    data_process_logger.info('start training')
    wv_model = gensim.models.Word2Vec(input_datas, min_count=min_count, sorted_vocab=sorted_vocab, window=window,
                                      size=size, iter=iter_n)
    with open(fout_path, 'wb') as fout:
        data_process_logger.info('start saving model')
        pickle.dump(wv_model, fout)
    print 'model saved'
def load(self):
    # disambiguator params
    print('[{}] Loading <disambiguator_weights>'.format(str(datetime.now())))
    disambiguator_weights = np.load(local_ref('../storage/sentence_disambiguation/trained_weights.npy'))
    print('[{}] Loading <disambiguator_tag_counts>'.format(str(datetime.now())))
    with open(local_ref('../storage/sentence_disambiguation/brown_tag_distribution.pkl')) as fp:
        disambiguator_tag_counts = cPickle.load(fp)
    print('[{}] Loading <disambiguator_tag_order>'.format(str(datetime.now())))
    with open(local_ref('../storage/sentence_disambiguation/brown_tag_order.pkl')) as fp:
        disambiguator_tag_order = cPickle.load(fp)
    # glove embedding params
    print('[{}] Loading <embedder_weights>'.format(str(datetime.now())))
    embedder_weights = np.load(local_ref('../storage/word_embedding/glove_weights_300d.npy'))
    print('[{}] Loading <embedder_vocab>'.format(str(datetime.now())))
    with open(local_ref('../storage/word_embedding/glove_vocab_300d.pkl')) as fp:
        embedder_vocab = cPickle.load(fp)
    # part-of-speech params
    print('[{}] Loading <pos_tagger_weights>'.format(str(datetime.now())))
    pos_tagger_weights = dict(np.load(local_ref('../storage/pos_tagger/pos_trained_weights.npz')))
    print('[{}] Loading <wordvec_model>'.format(str(datetime.now())))
    wordvec_model = models.Word2Vec.load_word2vec_format(
        local_ref('../storage/pos_tagger/GoogleNews-vectors-negative300.bin'), binary=True)
    # NER params
    print('[{}] Loading <ner_gen_params>'.format(str(datetime.now())))
    with open(local_ref('../storage/ner/gen_params_set.pkl')) as fp:
        ner_gen_params = cPickle.load(fp)
    print('[{}] Loading <ner_nn_params>'.format(str(datetime.now())))
    with open(local_ref('../storage/ner/nn_params_set.dill')) as fp:
        ner_nn_params = dill.load(fp)
    # stanford dep parser params
    print('[{}] Loading <dep_path_to_jar>'.format(str(datetime.now())))
    dep_path_to_jar = local_ref('../storage/dependency_parsing/stanford-parser.jar')
    print('[{}] Loading <dep_path_to_models_jar>'.format(str(datetime.now())))
    dep_path_to_models_jar = local_ref('../storage/dependency_parsing/stanford-parser-3.5.2-models.jar')
    self.bank['disambiguator_weights'] = disambiguator_weights
    self.bank['disambiguator_tag_counts'] = disambiguator_tag_counts
    self.bank['disambiguator_tag_order'] = disambiguator_tag_order
    self.bank['embedder_weights'] = embedder_weights
    self.bank['embedder_vocab'] = embedder_vocab
    self.bank['pos_tagger_weights'] = pos_tagger_weights
    self.bank['wordvec_model'] = wordvec_model
    self.bank['ner_gen_params'] = ner_gen_params
    self.bank['ner_nn_params'] = ner_nn_params
    self.bank['dep_path_to_jar'] = dep_path_to_jar
    self.bank['dep_path_to_models_jar'] = dep_path_to_models_jar
def gen_dataset(sentences,
                categories,
                max_words=78,
                train_test_split=True):
    ''' Generate a dataset of (input, output) pairs where the
        input is an embedded vector and output the category (one-hotted)

        Args
        ----
        sentences : list
            list of sentences where each sentence is list of tokens
        max_words : integer
            maximum number of words allowed in sentence
        train_test_split : boolean
            whether to split data into 2 sets
    '''
    num_sentences = len(sentences)
    model = models.Word2Vec.load_word2vec_format(
        local_ref('../storage/pos_tagger/GoogleNews-vectors-negative300.bin'),
        binary=True)
    vectorizer = lambda x: model[x] if x in model else np.zeros(300)
    encoder = one_hot_encoding(categories)
    X = np.zeros((num_sentences, max_words, 300))
    y = np.zeros((num_sentences, max_words, len(encoder.keys())))
    K = np.zeros(num_sentences)
    I = np.arange(num_sentences)
    param_dict = {}
    param_dict['max_words'] = max_words
    param_dict['encoder'] = encoder
    for sent_i in I:
        words = sentences[sent_i]
        cats = categories[sent_i]
        if sent_i % 1000 == 0:
            print("{} sentences parsed. {} remaining.".format(
                sent_i, num_sentences - sent_i - 1))
        X[sent_i, :, :], y[sent_i, :, :] = \
            prepare_sentence(words, categories=cats,
                             vectorizer=vectorizer,
                             encoder=encoder,
                             max_words=max_words)
        K[sent_i] = len(words)  # keep track of num words in sentence
    if train_test_split:
        (X_train, X_test), (I_train, I_test) = util.split_data(
            X, out_data=I, frac=0.80)
        y_train, y_test = y[I_train], y[I_test]
        K_train, K_test = K[I_train], K[I_test]
        return (X_train, X_test), (y_train, y_test), (K_train, K_test), param_dict
    return (X, y, K), param_dict
def gen_dataset(sentences,
                max_words=78,
                train_test_split=True):
    ''' Generate a dataset of (input, output) pairs where the
        input is an embedded vector and output the category (one-hotted)

        Args
        ----
        sentences : list
            list of sentences where each sentence is list of tokens
        max_words : integer
            maximum number of words allowed in sentence
        train_test_split : boolean
            whether to split data into 2 sets
    '''
    num_sentences = len(sentences)
    model = models.Word2Vec.load_word2vec_format(
        '../storage/pos_tagger/GoogleNews-vectors-negative300.bin',
        binary=True)
    vectorizer = lambda x: model[x] if x in model else np.ones(300) * ZERO_EPSILON
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmatizer = lambda x: wordnet_lemmatizer.lemmatize(x)
    X = np.zeros((num_sentences, max_words, 300))
    y = np.zeros((num_sentences, max_words, 300))
    K = np.zeros(num_sentences)
    I = np.arange(num_sentences)
    param_dict = {}
    param_dict['max_words'] = max_words
    for sent_i in I:
        words = sentences[sent_i]
        if sent_i % 1000 == 0:
            print("{} sentences parsed. {} remaining.".format(
                sent_i, num_sentences - sent_i - 1))
        X[sent_i, :, :], y[sent_i, :, :] = \
            prepare_sentence(words, vectorizer=vectorizer,
                             lemmatizer=lemmatizer,
                             max_words=max_words)
        K[sent_i] = len(words)  # keep track of num words in sentence
    if train_test_split:
        (X_train, X_test), (I_train, I_test) = util.split_data(
            X, out_data=I, frac=0.80)
        y_train, y_test = y[I_train], y[I_test]
        K_train, K_test = K[I_train], K[I_test]
        return (X_train, X_test), (y_train, y_test), (K_train, K_test), param_dict
    return (X, y, K), param_dict
def _expand_vocabulary(skip_thoughts_emb, skip_thoughts_vocab, word2vec):
    # Find words shared between the two vocabularies.
    print("Finding shared words")
    shared_words = [w for w in word2vec.vocab if w in skip_thoughts_vocab]
    # Select embedding vectors for shared words.
    print("Selecting embeddings for %d shared words" % len(shared_words))
    shared_st_emb = skip_thoughts_emb[[
        skip_thoughts_vocab[w] for w in shared_words]]
    shared_w2v_emb = word2vec[shared_words]
    # Train a linear regression model on the shared embedding vectors.
    print("Training linear regression model")
    model = sklearn.linear_model.LinearRegression()
    model.fit(shared_w2v_emb, shared_st_emb)
    # Create the expanded vocabulary.
    print("Creating embeddings for expanded vocabulary")
    embedding_map = collections.OrderedDict()
    print('Length of word2vec vocabulary: %d\n' % len(word2vec.vocab))
    for i, w in enumerate(word2vec.vocab):
        print('\rEmbedding %d' % (i + 1), end=' ')
        # Ignore words with underscores (spaces).
        if "_" not in w:
            w_emb = model.predict(word2vec[w].reshape(1, -1))
            embedding_map[w] = w_emb.reshape(-1)
    for w in skip_thoughts_vocab:
        embedding_map[w] = skip_thoughts_emb[skip_thoughts_vocab[w]]
    print("Created expanded vocabulary of %d words" % len(embedding_map))
    expanded_vocab = {}
    expanded_embeddings = np.zeros([len(embedding_map), paras.embedding_size])
    for i, w in enumerate(embedding_map.keys()):
        expanded_vocab[w] = i
        expanded_embeddings[i, :] = embedding_map[w]
    print('Saving expanded vocab and embeddings')
    with open(path + 'expanded_vocab.pkl', 'wb') as f:
        pkl.dump(expanded_vocab, f)
    embeddings_file = os.path.join(path, "expanded_embeddings.npy")
    np.save(embeddings_file, expanded_embeddings)
    return expanded_vocab, expanded_embeddings
# path = '../models/toronto_n5/'
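A minimal call sketch for the vocabulary-expansion step above, assuming skip_thoughts_emb and skip_thoughts_vocab are already loaded and that paras.embedding_size matches their width; the word2vec file name below is only an example.

# Hypothetical driver: load pretrained word2vec vectors, then fit the linear map
# from word2vec space to the skip-thoughts embedding space and expand the vocab.
import gensim

w2v = gensim.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin', binary=True)
expanded_vocab, expanded_embeddings = _expand_vocabulary(
    skip_thoughts_emb, skip_thoughts_vocab, w2v)
print(expanded_embeddings.shape)  # (len(expanded_vocab), paras.embedding_size)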
def train_rnas(seq_file='utrs.fa', outfile='rnadocEmbedding25.pickle'):
    min_count = 5
    dim = 50
    window = 5
    print('dim: ' + str(dim) + ', window: ' + str(window))
    seq_dict = read_fasta_file(seq_file)
    # text = seq_dict.values()
    tris = get_6_trids()
    sentences = []
    for seq in seq_dict.values():
        seq = seq.replace('T', 'U')
        bag_sen = []
        bag_seqs = split_overlap_seq(seq)
        for new_seq in bag_seqs:
            trvec = get_4_nucleotide_composition(tris, new_seq)
            bag_sen.append(trvec)
        # for aa in range(len(text)):
        sentences.append(bag_sen)
    # pdb.set_trace()
    print(len(sentences))
    model = None
    docs = train_tag_doc(sentences)
    # model = Word2Vec(sentences, min_count=min_count, size=dim, window=window, sg=1, iter=10, batch_words=100)
    # model = gensim.models.doc2vec.Doc2Vec(docs, size=50, window=300, min_count=min_count, workers=4)
    model = gensim.models.doc2vec.Doc2Vec(size=50, min_count=min_count, iter=50)
    model.build_vocab(docs)
    model.train(docs)
    '''vocab = list(model.vocab.keys())
    print vocab
    fw = open('rna_doc_dict', 'w')
    for val in vocab:
        fw.write(val + '\n')
    fw.close()
    #print model.syn0
    #pdb.set_trace()
    embeddingWeights = np.empty([len(vocab), dim])
    for i in range(len(vocab)):
        embeddingWeights[i,:] = model[vocab[i]]
    allWeights.append(embeddingWeights)
    '''
    # model.infer_vector(['only', 'you', 'can', 'prevent', 'forrest', 'fires'])
    # with open(outfile, 'w') as f:
    #     pickle.dump(model, f)
    # store the model to mmap-able files
    pdb.set_trace()
    model.save(outfile)
    # load the model back
    # model_loaded = Doc2Vec.load(outfile)
Source: sup_parser_v4_hierarchy_cnn.py (project: conll16st-hd-sdp, author: tbmihailov)
def filter_items_train_classifier_and_save_model_logreg(classifier_name, class_mapping_curr, relation_type,
                                                        train_x, train_y_txt,
                                                        train_y_relation_types, save_model_file):
    """
    Filters items by the given params, trains the classifier, and saves the trained model to a file.
    Args:
        classifier_name: Name of the classifier, used when saving the model
        class_mapping_curr: Class mapping used to map train_y_txt to int; also filters items
        relation_type: 1 Explicit, 0 Non-Explicit; only items with this relation type are used
        train_x: Train samples
        train_y_txt: Train sample classes - text class that will be filtered using the class_mapping_curr dict
        train_y_relation_types: Train type indicators for whether a sample is explicit or implicit.
            Only items with relation_type will be used for training
        save_model_file: Name of the file in which the trained model will be saved
    Returns:
        None; the trained classifier is pickled to save_model_file
    """
    logging.info('======[%s] - filter_items_train_classifier_and_save_model_logreg======' % classifier_name)
    train_x_curr = []
    train_y_curr = []
    # Filtering items
    logging.info('Filtering %s items...' % len(train_x))
    start = time.time()
    for i in range(0, len(train_x)):
        if train_y_txt[i] in class_mapping_curr and train_y_relation_types[i] == relation_type:
            train_x_curr.append(train_x[i])
            train_y_curr.append(class_mapping_curr[train_y_txt[i]])
    end = time.time()
    logging.info("Done in %s s" % (end - start))
    # Training
    # Classifier params
    classifier_current = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                             degree=3, gamma='auto', kernel='rbf',
                             max_iter=-1, probability=False, random_state=None, shrinking=True,
                             tol=0.001, verbose=False)
    print 'Classifier:\n%s' % classifier_current
    start = time.time()
    logging.info('Training with %s items...' % len(train_x_curr))
    classifier_current.fit(train_x_curr, train_y_curr)
    end = time.time()
    logging.info("Done in %s s" % (end - start))
    # Saving the trained model
    pickle.dump(classifier_current, open(save_model_file, 'wb'))
    logging.info('Model saved to %s' % save_model_file)
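A minimal usage sketch, assuming the feature vectors and labels are already prepared; the class mapping, variable names, and output file below are made-up placeholders.

# Hypothetical call: train an explicit-relation classifier on pre-extracted features.
class_mapping = {'Comparison': 0, 'Contingency': 1, 'Expansion': 2, 'Temporal': 3}  # example mapping
filter_items_train_classifier_and_save_model_logreg(
    classifier_name='svm_explicit',
    class_mapping_curr=class_mapping,
    relation_type=1,                       # 1 = Explicit relations only
    train_x=train_x,                       # list of feature vectors
    train_y_txt=train_y_txt,               # list of text class labels
    train_y_relation_types=train_y_types,  # 1/0 per sample (explicit/implicit)
    save_model_file='svm_explicit.pickle')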
Source: sup_parser_v6_hierarchy_cnn_cross.py (project: conll16st-hd-sdp, author: tbmihailov)
def filter_items_train_classifier_and_save_model_logreg(classifier_name, class_mapping_curr, relation_type,
                                                        train_x, train_y_txt,
                                                        train_y_relation_types, save_model_file):
    """
    Filters items by the given params, trains the classifier, and saves the trained model to a file.
    Args:
        classifier_name: Name of the classifier, used when saving the model
        class_mapping_curr: Class mapping used to map train_y_txt to int; also filters items
        relation_type: 1 Explicit, 0 Non-Explicit; only items with this relation type are used
        train_x: Train samples
        train_y_txt: Train sample classes - text class that will be filtered using the class_mapping_curr dict
        train_y_relation_types: Train type indicators for whether a sample is explicit or implicit.
            Only items with relation_type will be used for training
        save_model_file: Name of the file in which the trained model will be saved
    Returns:
        None; the trained classifier is pickled to save_model_file
    """
    logging.info('======[%s] - filter_items_train_classifier_and_save_model_logreg======' % classifier_name)
    train_x_curr = []
    train_y_curr = []
    # Filtering items
    logging.info('Filtering %s items...' % len(train_x))
    start = time.time()
    for i in range(0, len(train_x)):
        if train_y_txt[i] in class_mapping_curr and train_y_relation_types[i] == relation_type:
            train_x_curr.append(train_x[i])
            train_y_curr.append(class_mapping_curr[train_y_txt[i]])
    end = time.time()
    logging.info("Done in %s s" % (end - start))
    # Training
    # Classifier params
    # classifier_current = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    #                          degree=3, gamma='auto', kernel='rbf',
    #                          max_iter=-1, probability=False, random_state=None, shrinking=True,
    #                          tol=0.001, verbose=False)
    param_c = 0.1
    classifier_current = LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=param_c, fit_intercept=True,
                                            intercept_scaling=1, class_weight=None, random_state=None,
                                            solver='liblinear',
                                            max_iter=100, multi_class='ovr', verbose=0, warm_start=False, n_jobs=8)
    print 'Classifier:\n%s' % classifier_current
    start = time.time()
    logging.info('Training with %s items...' % len(train_x_curr))
    classifier_current.fit(train_x_curr, train_y_curr)
    end = time.time()
    logging.info("Done in %s s" % (end - start))
    # Saving the trained model
    pickle.dump(classifier_current, open(save_model_file, 'wb'))
    logging.info('Model saved to %s' % save_model_file)
Source: sup_parser_v3_hierarchy_cnn.py (project: conll16st-hd-sdp, author: tbmihailov)
def filter_items_train_classifier_and_save_model_logreg(classifier_name, class_mapping_curr, relation_type,
                                                        train_x, train_y_txt,
                                                        train_y_relation_types, save_model_file):
    """
    Filters items by the given params, trains the classifier, and saves the trained model to a file.
    Args:
        classifier_name: Name of the classifier, used when saving the model
        class_mapping_curr: Class mapping used to map train_y_txt to int; also filters items
        relation_type: 1 Explicit, 0 Non-Explicit; only items with this relation type are used
        train_x: Train samples
        train_y_txt: Train sample classes - text class that will be filtered using the class_mapping_curr dict
        train_y_relation_types: Train type indicators for whether a sample is explicit or implicit.
            Only items with relation_type will be used for training
        save_model_file: Name of the file in which the trained model will be saved
    Returns:
        None; the trained classifier is pickled to save_model_file
    """
    logging.info('======[%s] - filter_items_train_classifier_and_save_model_logreg======' % classifier_name)
    train_x_curr = []
    train_y_curr = []
    # Filtering items
    logging.info('Filtering %s items...' % len(train_x))
    start = time.time()
    for i in range(0, len(train_x)):
        if train_y_txt[i] in class_mapping_curr and train_y_relation_types[i] == relation_type:
            train_x_curr.append(train_x[i])
            train_y_curr.append(class_mapping_curr[train_y_txt[i]])
    end = time.time()
    logging.info("Done in %s s" % (end - start))
    # Training
    # Classifier params
    classifier_current = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                             degree=3, gamma='auto', kernel='rbf',
                             max_iter=-1, probability=False, random_state=None, shrinking=True,
                             tol=0.001, verbose=False)
    print 'Classifier:\n%s' % classifier_current
    start = time.time()
    logging.info('Training with %s items...' % len(train_x_curr))
    classifier_current.fit(train_x_curr, train_y_curr)
    end = time.time()
    logging.info("Done in %s s" % (end - start))
    # Saving the trained model
    pickle.dump(classifier_current, open(save_model_file, 'wb'))
    logging.info('Model saved to %s' % save_model_file)
Source: sup_parser_v5_hierarchy_cnn_cross.py (project: conll16st-hd-sdp, author: tbmihailov)
def filter_items_train_classifier_and_save_model_logreg(classifier_name, class_mapping_curr, relation_type,
                                                        train_x, train_y_txt,
                                                        train_y_relation_types, save_model_file):
    """
    Filters items by the given params, trains the classifier, and saves the trained model to a file.
    Args:
        classifier_name: Name of the classifier, used when saving the model
        class_mapping_curr: Class mapping used to map train_y_txt to int; also filters items
        relation_type: 1 Explicit, 0 Non-Explicit; only items with this relation type are used
        train_x: Train samples
        train_y_txt: Train sample classes - text class that will be filtered using the class_mapping_curr dict
        train_y_relation_types: Train type indicators for whether a sample is explicit or implicit.
            Only items with relation_type will be used for training
        save_model_file: Name of the file in which the trained model will be saved
    Returns:
        None; the trained classifier is pickled to save_model_file
    """
    logging.info('======[%s] - filter_items_train_classifier_and_save_model_logreg======' % classifier_name)
    train_x_curr = []
    train_y_curr = []
    # Filtering items
    logging.info('Filtering %s items...' % len(train_x))
    start = time.time()
    for i in range(0, len(train_x)):
        if train_y_txt[i] in class_mapping_curr and train_y_relation_types[i] == relation_type:
            train_x_curr.append(train_x[i])
            train_y_curr.append(class_mapping_curr[train_y_txt[i]])
    end = time.time()
    logging.info("Done in %s s" % (end - start))
    # Training
    # Classifier params
    classifier_current = SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                             degree=3, gamma='auto', kernel='rbf',
                             max_iter=-1, probability=False, random_state=None, shrinking=True,
                             tol=0.001, verbose=False)
    print 'Classifier:\n%s' % classifier_current
    start = time.time()
    logging.info('Training with %s items...' % len(train_x_curr))
    classifier_current.fit(train_x_curr, train_y_curr)
    end = time.time()
    logging.info("Done in %s s" % (end - start))
    # Saving the trained model
    pickle.dump(classifier_current, open(save_model_file, 'wb'))
    logging.info('Model saved to %s' % save_model_file)