def add_unk(self, thresh=0, unk_string='<UNK>'):
    """Rebuild the vocabulary around an unknown-word (UNK) token.

    The vocabulary is reset and repopulated through ``self.add_string``
    with, in order: ``unk_string``, the string of every existing token
    whose ``count`` is at least ``thresh``, and the strings of
    ``START_TOK`` / ``END_TOK`` (when set) if the threshold filter
    dropped them.  Afterwards ``self.unk``, ``self.START_TOK`` and
    ``self.END_TOK`` point at the rebuilt token objects.

    Args:
        thresh: Minimum ``count`` a token needs in order to be kept.
        unk_string: Surface string used for the UNK token.

    Raises:
        Exception: If ``unk_string`` is already in the vocabulary, or an
            UNK token has already been added.
    """
    if unk_string in self.s2t:
        raise Exception("tried to add an UNK token that already existed")
    if self.unk is not None:
        raise Exception("already added an UNK token")

    # UNK goes first so it is the first string handed to add_string.
    strings = [unk_string]
    strings.extend(
        token.s for token in self.tokens if token.count >= thresh
    )

    # Start/end markers are kept regardless of their count.  Compare the
    # token's *string* with the collected strings: testing the token object
    # itself against a list of str never matches and appended a duplicate.
    for special in (self.START_TOK, self.END_TOK):
        if special is not None and special.s not in strings:
            strings.append(special.s)

    self.tokens = set()
    self.strings = set()
    # The factories read self.unk lazily, so they return the UNK token once
    # it is assigned below.  Note that a defaultdict stores the default
    # under the missing key on lookup.
    self.i2t = defaultdict(lambda: self.unk)
    self.s2t = defaultdict(lambda: self.unk)
    for string in strings:
        self.add_string(string)

    self.unk = self.s2t[unk_string]
    # Re-point the special tokens at the objects created by the rebuild.
    if self.START_TOK is not None:
        self.START_TOK = self.s2t[self.START_TOK.s]
    if self.END_TOK is not None:
        self.END_TOK = self.s2t[self.END_TOK.s]
评论列表
文章目录