def POStagging(self):
    """POS-tag the entity-signature file and write the lemmatized result.

    Reads ``../file/entity_signature.txt`` line by line and writes
    ``../file/pos_signature.txt``:

    * A line containing ``***`` is a header. It is copied unchanged, then
      parsed as ``<a>.<name1>***<name2> <b>``; ``name1`` and ``name2``
      become the replacement names for the sentence lines that follow.
    * A line containing ``------`` is copied unchanged.
    * Any other line is parsed as ``<prefix>:<text>``. The prefix and the
      colon are written as-is; the text is tokenized and POS-tagged with
      NLTK. ``Entity1`` / ``Entity2`` tokens are replaced by ``#name1#`` /
      ``#name2#``, tokens whose tag starts with ``V`` or ``N`` (but not
      ``NNP``) are lemmatized, and every token is followed by one space.

    Both files are closed even if parsing or tagging raises.

    Raises:
        ValueError: if a header or sentence line does not split into the
            expected number of fields.
    """
    lemmatizer = WordNetLemmatizer()

    with open('../file/entity_signature.txt', 'r') as fin, \
            open('../file/pos_signature.txt', 'w+') as fout:
        for line in fin:
            if '***' in line:
                fout.write(line)
                # Unpacking is strict: a header that does not match the
                # expected shape raises ValueError.
                _, rest = line.split('.')
                pair, _ = rest.split()
                pro1, pro2 = pair.split('***')
            elif '------' in line:
                fout.write(line)
            else:
                # Only the first ':' separates the prefix from the text,
                # so colons inside the sentence are preserved.
                prefix, text = line.split(':', 1)
                fout.write(prefix + ':')

                # Tag the sentence with NLTK's default English tagger.
                for word, tag in nltk.pos_tag(nltk.word_tokenize(text)):
                    if word == 'Entity1':
                        word = '#' + pro1 + '#'
                    elif word == 'Entity2':
                        word = '#' + pro2 + '#'
                    elif (tag.startswith(('V', 'N'))
                          and not tag.startswith('NNP')):
                        # Verbs and common nouns are reduced to their lemma;
                        # proper nouns (NNP*) are left untouched.
                        word = lemmatizer.lemmatize(word)
                    # NOTE(review): the original indentation was lost. This
                    # assumes every token is written and only V/N tokens are
                    # lemmatized (rather than non-V/N tokens being dropped)
                    # -- confirm against the intended output format.
                    fout.write(word + ' ')
                fout.write('\n')
评论列表
文章目录