# Module-level imports and setup this method depends on:
import sys

from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def normalize_tokens(self):
    """Map character start/end offsets to token indices, then POS-tag and lemmatize."""
    # Every token start offset must pair with exactly one end offset.
    if len(self.stindices) != len(self.enindices):
        sys.stderr.write("\t\tIssue: overlapping tokenization for multiple tokens\n")
        return
    # Renumber start offsets as consecutive token indices.
    start = {}
    idx = 0
    for s in sorted(self.stindices):
        self.stindices[s] = idx
        start[idx] = s
        idx += 1
    # Renumber end offsets, extracting the token text for each (start, end) pair.
    end = {}
    idx = 0
    for t in sorted(self.enindices):
        self.enindices[t] = idx
        end[idx] = t
        # A token may not end after the next token has already begun.
        if idx > 0 and end[idx - 1] > start[idx]:
            sys.stderr.write("\t\tIssue: overlapping tokenization of neighboring tokens\n")
            return
        token = self.text[start[idx] : t + 1].strip()
        if " " in token:
            sys.stderr.write("\t\tIssue: incorrect tokenization " + token + "\n")
            return
        if token == "":
            # Skip empty tokens; idx is deliberately not advanced, so the
            # end-index numbering stays aligned with the tokens actually appended.
            continue
        self.tokens.append(token)
        idx += 1
    try:
        self.nltkpostags = [ele[1] for ele in pos_tag(self.tokens)]
        for idx in range(len(self.tokens)):
            tok = self.tokens[idx]
            # WordNet lemmatizes as a noun by default; pass pos='v' for verbs.
            if self.nltkpostags[idx].startswith("V"):
                self.nltklemmas.append(lemmatizer.lemmatize(tok, pos='v'))
            else:
                self.nltklemmas.append(lemmatizer.lemmatize(tok))
    except IndexError:
        print(self.tokens)
        print(pos_tag(self.tokens))
    return True
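
# A minimal usage sketch, not part of the original source: "Sentence" is a
# hypothetical stand-in class carrying only the attributes normalize_tokens()
# reads and writes. Running it requires the NLTK data for pos_tag and WordNet,
# e.g. nltk.download('averaged_perceptron_tagger') and nltk.download('wordnet').
class Sentence(object):
    # Reuse the function defined above as a method of the stand-in class.
    normalize_tokens = normalize_tokens

    def __init__(self, text, starts, ends):
        self.text = text
        self.stindices = {s: None for s in starts}  # char offset -> token index
        self.enindices = {e: None for e in ends}    # filled in by normalize_tokens
        self.tokens, self.nltkpostags, self.nltklemmas = [], [], []

sent = Sentence("The cat sat.", starts=[0, 4, 8, 11], ends=[2, 6, 10, 11])
if sent.normalize_tokens():
    print(sent.tokens)       # ['The', 'cat', 'sat', '.']
    print(sent.nltklemmas)   # e.g. ['The', 'cat', 'sit', '.']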