from nltk.stem import WordNetLemmatizer

def word_lemma(doc_unis_list):
    """Lemmatize every unigram in each document's token list."""
    wnl = WordNetLemmatizer()
    doc_stems_list = []
    for doc_unis in doc_unis_list:
        doc_stems = []
        for uni in doc_unis:
            stem_uni = wnl.lemmatize(uni)
            doc_stems.append(stem_uni)
        doc_stems_list.append(doc_stems)
    return doc_stems_list
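A minimal sketch of how word_lemma might be called, assuming NLTK's WordNet data has been downloaded; the pre-tokenized documents below are placeholders used only to illustrate the call:

    # hypothetical pre-tokenized documents
    docs = [['cats', 'are', 'running'], ['geese', 'flew', 'away']]
    lemmas = word_lemma(docs)
    print(lemmas)  # e.g. [['cat', 'are', 'running'], ['goose', 'flew', 'away']]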
########## Text Statistic Functions ##########
Example source code for the Python WordNetLemmatizer() class
def POStagging(self):
    # POS-tag the entity signature file, lemmatize verbs and common nouns
    # (proper nouns excluded), and write the result to pos_signature.txt.
    # Requires: import re, nltk; from nltk.stem import WordNetLemmatizer
    fin = open('../file/entity_signature.txt', 'r')
    fout = open('../file/pos_signature.txt', 'w+')
    lemmatizer = WordNetLemmatizer()
    j = 0  # counts signature lines with at least one lemmatized verb/noun
    num = 0
    while True:
        line = fin.readline()
        if line:
            if '***' in line:
                #print j, num
                fout.write(line)
                pro_num, pro = line.split('.')
                pro, num = pro.split()
                pro1, pro2 = pro.split('***')
                j = 0  # reset the counter for the new entity pair
            elif '------' in line:
                fout.write(line)
            else:
                # split text into tokens
                num, line = line.split(':', 1)
                fout.write(num + ':')
                text_tokens = nltk.word_tokenize(line)
                t = 0
                # tag the sentence, using the default NLTK English tagger
                # POS_TAGGER = 'taggers/maxent_treebank_pos_tagger/english.pickle'
                sentence_tag = nltk.pos_tag(text_tokens)
                for i in range(len(sentence_tag)):
                    word = sentence_tag[i][0]
                    tag = sentence_tag[i][1]
                    if word == 'Entity1':
                        fout.write('#' + pro1 + '# ')
                    elif word == 'Entity2':
                        fout.write('#' + pro2 + '# ')
                    else:
                        if (re.match('(V|N)', tag)) and (not re.match('(NNP)', tag)):
                            #if re.match('(V|N)', tag):
                            #if re.match('V', tag):
                            word = lemmatizer.lemmatize(word)
                            t = t + 1
                        fout.write(word + ' ')
                fout.write('\n')
                if t > 0:
                    j = j + 1
        else:
            break
    fin.close()
    fout.close()
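The core filtering step above (lemmatize only verbs and common nouns, leave proper nouns and everything else untouched) can be exercised in isolation. A minimal sketch, assuming nltk and its punkt, averaged_perceptron_tagger, and wordnet data are installed; the sample sentence is illustrative only:

    import re
    import nltk
    from nltk.stem import WordNetLemmatizer

    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize("Entity1 binds the activated receptors quickly")
    out = []
    for word, tag in nltk.pos_tag(tokens):
        # lemmatize verbs and common nouns; skip proper nouns (NNP*) and other tags
        if re.match('(V|N)', tag) and not re.match('(NNP)', tag):
            word = lemmatizer.lemmatize(word)
        out.append(word)
    print(' '.join(out))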
def gen_dataset(sentences,
                max_words=78,
                train_test_split=True):
    ''' Generate a dataset of (input, output) pairs where the
        input is an embedded vector and the output is the category (one-hotted).

        Args
        ----
        sentences : list
            list of sentences where each sentence is a list of tokens
        max_words : integer
            maximum number of words allowed in a sentence
        train_test_split : boolean
            whether to split the data into 2 sets
    '''
    num_sentences = len(sentences)
    model = models.Word2Vec.load_word2vec_format(
        '../storage/pos_tagger/GoogleNews-vectors-negative300.bin',
        binary=True)
    vectorizer = lambda x: model[x] if x in model else np.ones(300)*ZERO_EPSILON
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmatizer = lambda x: wordnet_lemmatizer.lemmatize(x)
    X = np.zeros((num_sentences, max_words, 300))
    y = np.zeros((num_sentences, max_words, 300))
    K = np.zeros(num_sentences)
    I = np.arange(num_sentences)
    param_dict = {}
    param_dict['max_words'] = max_words
    for sent_i in I:
        words = sentences[sent_i]
        if sent_i % 1000 == 0:
            print("{} sentences parsed. {} remaining.".format(
                sent_i, num_sentences - sent_i - 1))
        X[sent_i, :, :], y[sent_i, :, :] = \
            prepare_sentence(words, vectorizer=vectorizer,
                             lemmatizer=lemmatizer,
                             max_words=max_words)
        K[sent_i] = len(words)  # keep track of num words in sentence
    if train_test_split:
        (X_train, X_test), (I_train, I_test) = util.split_data(
            X, out_data=I, frac=0.80)
        y_train, y_test = y[I_train], y[I_test]
        K_train, K_test = K[I_train], K[I_test]
        return (X_train, X_test), (y_train, y_test), (K_train, K_test), param_dict
    return (X, y, K), param_dict
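A sketch of how gen_dataset might be invoked, assuming the module-level pieces it relies on (gensim's models, numpy as np, ZERO_EPSILON, prepare_sentence, util.split_data) are defined elsewhere in the project and the GoogleNews vectors file exists at the path above; the tokenized corpus here is a placeholder:

    # hypothetical tokenized corpus; real input would come from the project's loader
    sentences = [['the', 'dogs', 'were', 'barking'],
                 ['she', 'writes', 'papers']]
    (X, y, K), params = gen_dataset(sentences, max_words=78, train_test_split=False)
    print(X.shape, y.shape, K)  # (2, 78, 300), (2, 78, 300), per-sentence word counts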
def gen_dataset(sentences,
                max_words=78,
                train_test_split=True):
    ''' Generate a dataset of (input, output) pairs where the
        input is an embedded vector and the output is
        an embedded vector for the lemmatized form.

        Args
        ----
        sentences : list
            list of sentences where each sentence is a list of tokens
        max_words : integer
            maximum number of words allowed in a sentence
        train_test_split : boolean
            whether to split the data into 2 sets
    '''
    num_sentences = len(sentences)
    model = models.Word2Vec.load_word2vec_format(
        '../storage/pos_tagger/GoogleNews-vectors-negative300.bin',
        binary=True)
    vectorizer = lambda x: model[x] if x in model else np.ones(300)*ZERO_EPSILON
    lemmatizer = WordNetLemmatizer().lemmatize
    X = np.zeros((num_sentences, max_words, 300))
    y = np.zeros((num_sentences, max_words, 300))
    K = np.zeros(num_sentences)
    I = np.arange(num_sentences)
    param_dict = {}
    param_dict['max_words'] = max_words
    for sent_i, words in enumerate(sentences):
        if sent_i % 1000 == 0:
            print("{} sentences parsed. {} remaining.".format(
                sent_i, num_sentences - sent_i - 1))
        X[sent_i, :, :], y[sent_i, :, :] = \
            prepare_sentence(words,
                             vectorizer=vectorizer,
                             lemmatizer=lemmatizer,
                             max_words=max_words)
        K[sent_i] = len(words)  # keep track of num words in sentence
    if train_test_split:
        (X_train, X_test), (I_train, I_test) = split_data(
            X, out_data=I, frac=0.80)
        y_train, y_test = y[I_train], y[I_test]
        K_train, K_test = K[I_train], K[I_test]
        return (X_train, X_test), (y_train, y_test), (K_train, K_test), param_dict
    return (X, y, K), param_dict
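prepare_sentence itself is not included in these snippets. A plausible minimal version, written here purely as an assumption about its contract (it appears to return two (max_words, 300) arrays: embeddings of the original tokens and of their lemmas, zero-padded to max_words rows), might look like this:

    import numpy as np

    def prepare_sentence(words, vectorizer=None, lemmatizer=None, max_words=78):
        # hypothetical reconstruction: embed each token and its lemma,
        # then zero-pad both matrices out to max_words rows
        X = np.zeros((max_words, 300))
        y = np.zeros((max_words, 300))
        for i, word in enumerate(words[:max_words]):
            X[i, :] = vectorizer(word)
            y[i, :] = vectorizer(lemmatizer(word))
        return X, y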