def __call__(self, text):
    '''
    Split the text into sentences and build one Tag per word.

    Every sentence is tokenized with nltk.wordpunct_tokenize, tokens made
    up only of punctuation are dropped, the remaining tokens are lowercased
    and then part-of-speech tagged with nltk.pos_tag. The last word of each
    sentence is created with terminal=True. Every word except the very
    first word of the text is created with an explicit `proper` flag.

    @param text: the string of text to be tagged
    @returns: a list of tags respecting the order in the text
    '''
    punctuation = set(string.punctuation)

    def is_punctuation(token):
        # wordpunct_tokenize groups adjacent punctuation characters into a
        # single token ("...", "?!", ")."), so testing the whole token for
        # membership in the set would let those through; check every
        # character instead.
        return all(char in punctuation for char in token)

    def is_proper(pos):
        # NOTE(review): 'NN' is the Penn Treebank tag for a singular common
        # noun; proper nouns are tagged 'NNP'. The comparison is kept
        # unchanged to preserve existing results - confirm the intent.
        return pos == 'NN'

    tags = []
    for sentence_index, sentence in enumerate(nltk.sent_tokenize(text)):
        # Strip punctuation and lowercase before POS tagging.
        tokens = [token.lower()
                  for token in nltk.wordpunct_tokenize(sentence)
                  if not is_punctuation(token)]
        words = nltk.pos_tag(tokens)
        last_position = len(words) - 1
        for position, (word, pos) in enumerate(words):
            options = {}
            # The first word of the whole text is the only one built
            # without `proper`, leaving that argument at Tag's default.
            if sentence_index > 0 or position > 0:
                options['proper'] = is_proper(str(pos))
            # Mark the sentence boundary on the final word.
            if position == last_position:
                options['terminal'] = True
            tags.append(Tag(str(word), **options))
    return tags
# Comment list
# Table of contents