xmlannotations.py 文件源码

python
阅读 22 收藏 0 点赞 0 评论 0

项目:open-sesame 作者: Noahs-ARK 项目源码 文件源码
def normalize_tokens(self):
        """Rebuild the token list from the recorded offsets, then tag and lemmatize.

        Reads ``self.stindices`` and ``self.enindices`` (dicts keyed by the
        start and end offsets used to slice ``self.text``; the end offset is
        inclusive) and ``self.text``.  The values of both dicts are
        overwritten in place with ordinal positions.  Each extracted token is
        appended to ``self.tokens``; POS tags are stored in
        ``self.nltkpostags`` and one lemma per token is appended to
        ``self.nltklemmas``.

        Returns:
            ``True`` when tokenization succeeded (also when an ``IndexError``
            was caught and reported during tagging/lemmatization), or ``None``
            when a tokenization problem was written to ``sys.stderr``.  On a
            ``None`` return the offset dicts and ``self.tokens`` may already
            have been partially updated.
        """
        if len(self.stindices) != len(self.enindices):
            sys.stderr.write("\t\tIssue: overlapping tokenization for multiple tokens\n")
            return

        # Rank the start offsets in ascending order and keep the reverse
        # mapping (rank -> offset) for slicing below.
        start = {}
        for idx, s in enumerate(sorted(self.stindices)):
            self.stindices[s] = idx
            start[idx] = s

        end = {}
        idx = 0
        for t in sorted(self.enindices):
            self.enindices[t] = idx
            end[idx] = t
            # A token may not begin before the previous token has ended.
            if idx > 0 and end[idx - 1] > start[idx]:
                sys.stderr.write("\t\tIssue: overlapping tokenization of neighboring tokens\n")
                return
            # The end offset is inclusive, hence the + 1.
            token = self.text[start[idx] : t + 1].strip()
            if " " in token:
                sys.stderr.write("\t\tIssue: incorrect tokenization "  + token + "\n")
                return
            if token == "":
                # The rank is intentionally not advanced: the next end offset
                # reuses this slot, so the blank span is folded into the
                # following token (its leading whitespace is stripped).
                # NOTE(review): the ranks stored in self.stindices are not
                # shifted to match - confirm callers tolerate the mismatch.
                continue
            self.tokens.append(token)
            idx += 1

        try:
            self.nltkpostags = [tagged[1] for tagged in pos_tag(self.tokens)]
            # The tag is looked up by index (rather than zipped) so that a
            # tag/token length mismatch still surfaces as IndexError below.
            for position, tok in enumerate(self.tokens):
                if self.nltkpostags[position].startswith("V"):
                    self.nltklemmas.append(lemmatizer.lemmatize(tok, pos='v'))
                else:
                    self.nltklemmas.append(lemmatizer.lemmatize(tok))
        except IndexError:
            # Debug output only; the single-argument call form prints the same
            # text under Python 2 and Python 3.
            print(self.tokens)
            print(pos_tag(self.tokens))
        return True
评论列表
文章目录


问题


面经


文章

微信
公众号

扫码关注公众号