# tagger.py source file - Python code snippet

def __call__(self, text):
        '''
        Split the text into sentences, tokenize and POS-tag each sentence,
        and turn every remaining word into a Tag.

        Words are lower-cased and tokens made up only of punctuation are
        dropped before tagging. The last word of every sentence is created
        with terminal=True. Every word except the very first word of the
        first sentence is created with an explicit proper flag, which is True
        when the POS tag of the word is 'NN'.

        @param text: the string of text to be tagged

        @returns: a list of tags respecting the order in the text
        '''

        punctuation = set(string.punctuation)

        # NOTE(review): in the Penn Treebank tagset 'NN' is a common noun and
        # 'NNP' a proper noun; the comparison is kept as it was - confirm
        # which one is intended before changing it.
        def proper_noun(pos):
            return pos == 'NN'

        def tagged_words(sentence):
            # wordpunct_tokenize can return a run of punctuation as a single
            # token (e.g. '...' or '?!'), so testing the whole token for
            # membership in the punctuation set is not enough: check that
            # every character of the token is punctuation instead.
            tokens = [token.lower()
                      for token in nltk.wordpunct_tokenize(sentence)
                      if not all(char in punctuation for char in token)]
            return nltk.pos_tag(tokens)

        tags = []

        for sentence_index, sentence in enumerate(nltk.sent_tokenize(text)):
            words = tagged_words(sentence)
            last_position = len(words) - 1

            for position, (word, pos) in enumerate(words):
                options = {}
                # The opening word of the first sentence is the only one that
                # is created without a proper flag.
                if sentence_index > 0 or position > 0:
                    options['proper'] = proper_noun(str(pos))
                # The closing word of each sentence is marked as terminal.
                if position == last_position:
                    options['terminal'] = True
                tags.append(Tag(str(word), **options))

        return tags